From 1eb7cc3017a6def34fb448781578888764d1e659 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Mon, 26 Jun 2017 21:24:55 +0100
Subject: [PATCH 01/90] attempt a port from #1147

---
 spacy/lang/ga/__init__.py             |  24 ++++++
 spacy/lang/ga/stop_words.py           |  45 ++++++++++
 spacy/lang/ga/tokenizer_exceptions.py | 115 ++++++++++++++++++++++++++
 3 files changed, 184 insertions(+)
 create mode 100644 spacy/lang/ga/__init__.py
 create mode 100644 spacy/lang/ga/stop_words.py
 create mode 100644 spacy/lang/ga/tokenizer_exceptions.py

diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
new file mode 100644
index 000000000..8231cc925
--- /dev/null
+++ b/spacy/lang/ga/__init__.py
@@ -0,0 +1,24 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class Irish(Language):
+    lang = 'nb'
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'ga'
+
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        stop_words = set(STOP_WORDS)
+
+
+__all__ = ['Irish']
diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py
new file mode 100644
index 000000000..816c00b13
--- /dev/null
+++ b/spacy/lang/ga/stop_words.py
@@ -0,0 +1,45 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+STOP_WORDS = set("""
+a ach ag agus an aon ar arna as
+
+ba beirt bhúr
+
+caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear
+
+daichead dar de deich deichniúr den dhá do don dtí dá dár dó
+
+faoi faoin faoina faoinár fara fiche
+
+gach gan go gur
+
+haon hocht
+
+i iad idir in ina ins inár is
+
+le leis lena lenár
+
+mar mo muid mé
+
+na nach naoi naonúr ná ní níor nó nócha
+
+ocht ochtar ochtó os
+
+roimh
+
+sa seacht seachtar seachtó seasca seisear siad sibh sinn sna sé sí
+
+tar thar thú triúr trí trína trínár tríocha tú
+
+um
+
+ár
+
+é éis
+
+í
+
+ó ón óna ónár
+""".split())
diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
new file mode 100644
index 000000000..ce280a3a2
--- /dev/null
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -0,0 +1,115 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import ORTH, LEMMA, NORM
+
+
+_exc = {
+    "'acha'n": [
+        {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
+        {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}],
+
+    "dem'": [
+        {ORTH: "de", LEMMA: "de", NORM: "de"},
+        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
+
+    "ded'": [
+        {ORTH: "de", LEMMA: "de", NORM: "de"},
+        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
+
+    "lem'": [
+        {ORTH: "le", LEMMA: "le", NORM: "le"},
+        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
+
+    "led'": [
+        {ORTH: "le", LEMMA: "le", NORM: "le"},
+        {ORTH: "d'", LEMMA: "mo", NORM: "do"}],
+
+    "a.C.n.": [
+        {ORTH: "a.", LEMMA: "ante"},
+        {ORTH: "C.", LEMMA: "Christum"},
+        {ORTH: "n.", LEMMA: "natum"}],
+
+    "m.sh.": [
+        {ORTH: "m.", LEMMA: "mar"},
+        {ORTH: "sh.", LEMMA: "sampla"}],
+
+    "M.F.": [
+        {ORTH: "M.", LEMMA: "Meán"},
+        {ORTH: "F.", LEMMA: "Fómhar"}],
+
+    "M.Fómh.": [
+        {ORTH: "M.", LEMMA: "Meán"},
+        {ORTH: "Fómh.", LEMMA: "Fómhar"}],
+
+    "R.C.": [
+        {ORTH: "Rr.", LEMMA: "roimh"},
+        {ORTH: "C.", LEMMA: "Críost"}],
+
+    "r.Ch.": [
+        {ORTH: "r.", LEMMA: "roimh"},
+        {ORTH: "Ch.", LEMMA: "Críost"}],
+
+    "r.Chr.": [
+        {ORTH: "r.", LEMMA: "roimh"},
+        {ORTH: "Chr.", LEMMA: "Críost"}],
+
+    "R.Ch.": [
+        {ORTH: "R.", LEMMA: "roimh"},
+        {ORTH: "Ch.", LEMMA: "Críost"}],
+
+    "R.Chr.": [
+        {ORTH: "R.", LEMMA: "roimh"},
+        {ORTH: "Chr.", LEMMA: "Críost"}],
+
+    "⁊rl.": [
+        {ORTH: "⁊", LEMMA: "agus"},
+        {ORTH: "rl.", LEMMA: "araile"}],
+
+    "srl.": [
+        {ORTH: "s", LEMMA: "agus"},
+        {ORTH: "rl.", LEMMA: "araile"}],
+
+}
+
+for exc_data in [
+    {ORTH: "'gus", LEMMA: "agus", NORM: "agus"},
+    {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
+    {ORTH: "ao'", LEMMA: "aon", NORM: "aon"},
+    {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"},
+    {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"},
+    {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"},
+    {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"},
+    {ORTH: "m'", LEMMA: "mo"},,
+    {ORTH: "Aib.", LEMMA: "Aibreán"},
+    {ORTH: "Ath.", LEMMA: "athair"},
+    {ORTH: "Beal.", LEMMA: "Bealtaine"},
+    {ORTH: "Co.", LEMMA: "contae"},
+    {ORTH: "Ean.", LEMMA: "Eanáir"},
+    {ORTH: "Feab.", LEMMA: "Feabhra"},
+    {ORTH: "gCo.", LEMMA: "contae"},
+    {ORTH: ".i.", LEMMA: "eadhon"},
+    {ORTH: "lch.", LEMMA: "leathanach"},
+    {ORTH: "Lch.", LEMMA: "leathanach"},
+    {ORTH: "lgh.", LEMMA: "leathanach"},
+    {ORTH: "Lgh.", LEMMA: "leathanach"},
+    {ORTH: "Lún.", LEMMA: "Lúnasa"},
+    {ORTH: "Már.", LEMMA: "Márta"},
+    {ORTH: "Meith.", LEMMA: "Meitheamh"},
+    {ORTH: "Noll.", LEMMA: "Nollaig"},
+    {ORTH: "Samh.", LEMMA: "Samhain"},
+    {ORTH: "tAth.", LEMMA: "athair"},
+    {ORTH: "tUas.", LEMMA: "Uasal"},
+    {ORTH: "teo.", LEMMA: "teoranta"},
+    {ORTH: "Teo.", LEMMA: "teoranta"},
+    {ORTH: "Uas.", LEMMA: "Uasal"},
+    {ORTH: "uimh.", LEMMA: "uimhir"},
+    {ORTH: "Uimh.", LEMMA: "uimhir"}]:
+    _exc[exc_data[ORTH]] = [dict(exc_data)],
+
+for orth in [
+    "d'"]:
+    _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = dict(_exc)

From e9213f54deece142fff6c4ff0a2ae4106288f417 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Mon, 26 Jun 2017 21:29:21 +0100
Subject: [PATCH 02/90] missed one

---
 spacy/lang/ga/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
index 8231cc925..7b72a8a91 100644
--- a/spacy/lang/ga/__init__.py
+++ b/spacy/lang/ga/__init__.py
@@ -11,7 +11,7 @@ from ...util import update_exc
 
 
 class Irish(Language):
-    lang = 'nb'
+    lang = 'ga'
 
     class Defaults(Language.Defaults):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

From 3c4d83aa6e634b19889338bdf3c0dfd593f9fdc6 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Sat, 24 Jun 2017 22:29:02 +0100
Subject: [PATCH 03/90] CLA

---
 .github/contributors/jimregan.md | 106 +++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 .github/contributors/jimregan.md

diff --git a/.github/contributors/jimregan.md b/.github/contributors/jimregan.md
new file mode 100644
index 000000000..dd8fe3d64
--- /dev/null
+++ b/.github/contributors/jimregan.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jim O'Regan          |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2017-06-24           |
+| GitHub username                | jimregan             |
+| Website (optional)             |                      |

From a8dff9133e84671a3111390a8f4e8965ec744519 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Mon, 26 Jun 2017 21:53:41 +0100
Subject: [PATCH 04/90] add POS

---
 spacy/lang/ga/tokenizer_exceptions.py | 130 +++++++++++++-------------
 1 file changed, 65 insertions(+), 65 deletions(-)

diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index ce280a3a2..3dca1c3d7 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -1,110 +1,110 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import ORTH, LEMMA, NORM
+from ...symbols import ORTH, LEMMA, NORM, POS, ADJ, ADP, ADV, AUX, CCONJ, DET, NOUN, X
 
 
 _exc = {
     "'acha'n": [
-        {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
-        {ORTH: "a'n", LEMMA: "aon", NORM: "aon"}],
+        {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET},
+        {ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET}],
 
     "dem'": [
-        {ORTH: "de", LEMMA: "de", NORM: "de"},
-        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
+        {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
+        {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}],
 
     "ded'": [
-        {ORTH: "de", LEMMA: "de", NORM: "de"},
-        {ORTH: "d'", LEMMA: "do", NORM: "do"}],
+        {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
+        {ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET}],
 
     "lem'": [
-        {ORTH: "le", LEMMA: "le", NORM: "le"},
-        {ORTH: "m'", LEMMA: "mo", NORM: "mo"}],
+        {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
+        {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET}],
 
     "led'": [
-        {ORTH: "le", LEMMA: "le", NORM: "le"},
-        {ORTH: "d'", LEMMA: "mo", NORM: "do"}],
+        {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
+        {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}],
 
     "a.C.n.": [
-        {ORTH: "a.", LEMMA: "ante"},
-        {ORTH: "C.", LEMMA: "Christum"},
-        {ORTH: "n.", LEMMA: "natum"}],
+        {ORTH: "a.", LEMMA: "ante", POS: X},
+        {ORTH: "C.", LEMMA: "Christum", POS: X},
+        {ORTH: "n.", LEMMA: "natum", POS: X}],
 
     "m.sh.": [
-        {ORTH: "m.", LEMMA: "mar"},
-        {ORTH: "sh.", LEMMA: "sampla"}],
+        {ORTH: "m.", LEMMA: "mar", POS: ADP},
+        {ORTH: "sh.", LEMMA: "sampla", POS: NOUN}],
 
     "M.F.": [
-        {ORTH: "M.", LEMMA: "Meán"},
-        {ORTH: "F.", LEMMA: "Fómhar"}],
+        {ORTH: "M.", LEMMA: "Meán", POS: NOUN},
+        {ORTH: "F.", LEMMA: "Fómhar", POS: NOUN}],
 
     "M.Fómh.": [
-        {ORTH: "M.", LEMMA: "Meán"},
-        {ORTH: "Fómh.", LEMMA: "Fómhar"}],
+        {ORTH: "M.", LEMMA: "Meán", POS: NOUN},
+        {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}],
 
     "R.C.": [
-        {ORTH: "Rr.", LEMMA: "roimh"},
-        {ORTH: "C.", LEMMA: "Críost"}],
+        {ORTH: "Rr.", LEMMA: "roimh", POS: ADP},
+        {ORTH: "C.", LEMMA: "Críost", POS: NOUN}],
 
     "r.Ch.": [
-        {ORTH: "r.", LEMMA: "roimh"},
-        {ORTH: "Ch.", LEMMA: "Críost"}],
+        {ORTH: "r.", LEMMA: "roimh", POS: ADP},
+        {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}],
 
     "r.Chr.": [
-        {ORTH: "r.", LEMMA: "roimh"},
-        {ORTH: "Chr.", LEMMA: "Críost"}],
+        {ORTH: "r.", LEMMA: "roimh", POS: ADP},
+        {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}],
 
     "R.Ch.": [
-        {ORTH: "R.", LEMMA: "roimh"},
-        {ORTH: "Ch.", LEMMA: "Críost"}],
+        {ORTH: "R.", LEMMA: "roimh", POS: ADP},
+        {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}],
 
     "R.Chr.": [
-        {ORTH: "R.", LEMMA: "roimh"},
-        {ORTH: "Chr.", LEMMA: "Críost"}],
+        {ORTH: "R.", LEMMA: "roimh", POS: ADP},
+        {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}],
 
     "⁊rl.": [
-        {ORTH: "⁊", LEMMA: "agus"},
-        {ORTH: "rl.", LEMMA: "araile"}],
+        {ORTH: "⁊", LEMMA: "agus", POS: CCONJ},
+        {ORTH: "rl.", LEMMA: "araile", POS: ADJ}],
 
     "srl.": [
-        {ORTH: "s", LEMMA: "agus"},
-        {ORTH: "rl.", LEMMA: "araile"}],
+        {ORTH: "s", LEMMA: "agus", POS: CCONJ},
+        {ORTH: "rl.", LEMMA: "araile", POS: ADJ}],
 
 }
 
 for exc_data in [
-    {ORTH: "'gus", LEMMA: "agus", NORM: "agus"},
-    {ORTH: "'ach", LEMMA: "gach", NORM: "gach"},
+    {ORTH: "'gus", LEMMA: "agus", NORM: "agus", POS: CCONJ},
+    {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET},
     {ORTH: "ao'", LEMMA: "aon", NORM: "aon"},
-    {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar"},
-    {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos"},
-    {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu"},
-    {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht"},
-    {ORTH: "m'", LEMMA: "mo"},,
-    {ORTH: "Aib.", LEMMA: "Aibreán"},
-    {ORTH: "Ath.", LEMMA: "athair"},
-    {ORTH: "Beal.", LEMMA: "Bealtaine"},
-    {ORTH: "Co.", LEMMA: "contae"},
-    {ORTH: "Ean.", LEMMA: "Eanáir"},
-    {ORTH: "Feab.", LEMMA: "Feabhra"},
-    {ORTH: "gCo.", LEMMA: "contae"},
-    {ORTH: ".i.", LEMMA: "eadhon"},
-    {ORTH: "lch.", LEMMA: "leathanach"},
-    {ORTH: "Lch.", LEMMA: "leathanach"},
-    {ORTH: "lgh.", LEMMA: "leathanach"},
-    {ORTH: "Lgh.", LEMMA: "leathanach"},
-    {ORTH: "Lún.", LEMMA: "Lúnasa"},
-    {ORTH: "Már.", LEMMA: "Márta"},
-    {ORTH: "Meith.", LEMMA: "Meitheamh"},
-    {ORTH: "Noll.", LEMMA: "Nollaig"},
-    {ORTH: "Samh.", LEMMA: "Samhain"},
-    {ORTH: "tAth.", LEMMA: "athair"},
-    {ORTH: "tUas.", LEMMA: "Uasal"},
-    {ORTH: "teo.", LEMMA: "teoranta"},
-    {ORTH: "Teo.", LEMMA: "teoranta"},
-    {ORTH: "Uas.", LEMMA: "Uasal"},
-    {ORTH: "uimh.", LEMMA: "uimhir"},
-    {ORTH: "Uimh.", LEMMA: "uimhir"}]:
+    {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV},
+    {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV},
+    {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV},
+    {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV},
+    {ORTH: "m'", LEMMA: "mo", POS: DET},
+    {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN},
+    {ORTH: "Ath.", LEMMA: "athair", POS: NOUN},
+    {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN},
+    {ORTH: "Co.", LEMMA: "contae", POS: NOUN},
+    {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN},
+    {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN},
+    {ORTH: "gCo.", LEMMA: "contae", POS: NOUN},
+    {ORTH: ".i.", LEMMA: "eadhon", POS: ADV},
+    {ORTH: "lch.", LEMMA: "leathanach", POS: NOUN},
+    {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN},
+    {ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN},
+    {ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN},
+    {ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN},
+    {ORTH: "Már.", LEMMA: "Márta", POS: NOUN},
+    {ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN},
+    {ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN},
+    {ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN},
+    {ORTH: "tAth.", LEMMA: "athair", POS: NOUN},
+    {ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN},
+    {ORTH: "teo.", LEMMA: "teoranta", POS: NOUN},
+    {ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN},
+    {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN},
+    {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN},
+    {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}]:
     _exc[exc_data[ORTH]] = [dict(exc_data)],
 
 for orth in [

From 5e5f94c1c0939da81dc939ed10c639f50557522c Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Mon, 26 Jun 2017 21:57:00 +0100
Subject: [PATCH 05/90] fix dup

---
 spacy/lang/ga/tokenizer_exceptions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index 3dca1c3d7..fad51a2fb 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -43,7 +43,7 @@ _exc = {
         {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}],
 
     "R.C.": [
-        {ORTH: "Rr.", LEMMA: "roimh", POS: ADP},
+        {ORTH: "R.", LEMMA: "roimh", POS: ADP},
         {ORTH: "C.", LEMMA: "Críost", POS: NOUN}],
 
     "r.Ch.": [

From c1e4e0f3bf355eb7771759fcab58229f630e98e5 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Mon, 26 Jun 2017 22:19:39 +0100
Subject: [PATCH 06/90] just now discovered that you can do multiwords

---
 spacy/lang/ga/tokenizer_exceptions.py | 56 ++++++---------------------
 1 file changed, 11 insertions(+), 45 deletions(-)

diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index fad51a2fb..afd901e33 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -25,51 +25,6 @@ _exc = {
         {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
         {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}],
 
-    "a.C.n.": [
-        {ORTH: "a.", LEMMA: "ante", POS: X},
-        {ORTH: "C.", LEMMA: "Christum", POS: X},
-        {ORTH: "n.", LEMMA: "natum", POS: X}],
-
-    "m.sh.": [
-        {ORTH: "m.", LEMMA: "mar", POS: ADP},
-        {ORTH: "sh.", LEMMA: "sampla", POS: NOUN}],
-
-    "M.F.": [
-        {ORTH: "M.", LEMMA: "Meán", POS: NOUN},
-        {ORTH: "F.", LEMMA: "Fómhar", POS: NOUN}],
-
-    "M.Fómh.": [
-        {ORTH: "M.", LEMMA: "Meán", POS: NOUN},
-        {ORTH: "Fómh.", LEMMA: "Fómhar", POS: NOUN}],
-
-    "R.C.": [
-        {ORTH: "R.", LEMMA: "roimh", POS: ADP},
-        {ORTH: "C.", LEMMA: "Críost", POS: NOUN}],
-
-    "r.Ch.": [
-        {ORTH: "r.", LEMMA: "roimh", POS: ADP},
-        {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}],
-
-    "r.Chr.": [
-        {ORTH: "r.", LEMMA: "roimh", POS: ADP},
-        {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}],
-
-    "R.Ch.": [
-        {ORTH: "R.", LEMMA: "roimh", POS: ADP},
-        {ORTH: "Ch.", LEMMA: "Críost", POS: NOUN}],
-
-    "R.Chr.": [
-        {ORTH: "R.", LEMMA: "roimh", POS: ADP},
-        {ORTH: "Chr.", LEMMA: "Críost", POS: NOUN}],
-
-    "⁊rl.": [
-        {ORTH: "⁊", LEMMA: "agus", POS: CCONJ},
-        {ORTH: "rl.", LEMMA: "araile", POS: ADJ}],
-
-    "srl.": [
-        {ORTH: "s", LEMMA: "agus", POS: CCONJ},
-        {ORTH: "rl.", LEMMA: "araile", POS: ADJ}],
-
 }
 
 for exc_data in [
@@ -84,6 +39,17 @@ for exc_data in [
     {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN},
     {ORTH: "Ath.", LEMMA: "athair", POS: NOUN},
     {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN},
+    {ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X},
+    {ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV},
+    {ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN},
+    {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN},
+    {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN},
+    {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN},
+    {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV},
+    {ORTH: "srl.", LEMMA: "agus araile", POS: ADV},
     {ORTH: "Co.", LEMMA: "contae", POS: NOUN},
     {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN},
     {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN},

From e12defdd9ca8748f553f3360ffc0242ee234fd25 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Mon, 26 Jun 2017 22:24:14 +0100
Subject: [PATCH 07/90] missed a couple

---
 spacy/lang/ga/tokenizer_exceptions.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index afd901e33..2f6d1ebdf 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -45,9 +45,12 @@ for exc_data in [
     {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN},
     {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN},
     {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN},
+    {ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV},
     {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV},
     {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV},
     {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV},
     {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV},
     {ORTH: "srl.", LEMMA: "agus araile", POS: ADV},
     {ORTH: "Co.", LEMMA: "contae", POS: NOUN},

From 559e03605a52d2c68ba5e565ff69d0a09690f4f5 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Tue, 27 Jun 2017 22:42:16 +0100
Subject: [PATCH 08/90] b'

---
 spacy/lang/ga/tokenizer_exceptions.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index 2f6d1ebdf..7d29f4bcc 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -58,6 +58,8 @@ for exc_data in [
     {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN},
     {ORTH: "gCo.", LEMMA: "contae", POS: NOUN},
     {ORTH: ".i.", LEMMA: "eadhon", POS: ADV},
+    {ORTH: "B'", LEMMA: "ba", POS: AUX},
+    {ORTH: "b'", LEMMA: "ba", POS: AUX},
     {ORTH: "lch.", LEMMA: "leathanach", POS: NOUN},
     {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN},
     {ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN},
@@ -77,7 +79,7 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [dict(exc_data)],
 
 for orth in [
-    "d'"]:
+    "d'", "D'"]:
     _exc[orth] = [{ORTH: orth}]
 
 

From 1ba38b2036e69ea0ff400e14e217d887f09f7165 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Wed, 28 Jun 2017 00:42:00 +0100
Subject: [PATCH 09/90] some helpers; the Irish part of UD only has 2500
 sentences so this will need source of morphology

---
 spacy/lang/ga/irish_morphology_helpers.py | 33 +++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 spacy/lang/ga/irish_morphology_helpers.py

diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py
new file mode 100644
index 000000000..2b008f295
--- /dev/null
+++ b/spacy/lang/ga/irish_morphology_helpers.py
@@ -0,0 +1,33 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+class IrishMorph:
+    consonants = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z']
+    broad_vowels = ['a', 'á', 'o', 'ó', 'u', 'ú']
+    slender_vowels = ['e', 'é', 'i', 'í']
+    vowels = broad_vowels + slender_vowels
+
+    def ends_dentals(word):
+        if word[-1:] in ['d', 'n', 't', 's']:
+            return True
+        else:
+            return False
+
+    def devoice(word):
+        if word[-2] == 's' and word[-1] == 'd':
+            return word[:-1] + 't'
+        else:
+            return word
+
+    def ends_with_vowel(word):
+        return word[-1] in vowels
+
+    def starts_with_vowel(word):
+        return word[0] in vowels
+
+    def deduplicate(word):
+        if word[-2] == word[-1] and word[-1] in consonants:
+            return word[:-1]
+        else:
+            return word
+

From 70f4d26c108dbb9b2dcfbf4a1c90d9fdfcea2a7d Mon Sep 17 00:00:00 2001
From: Jim O'Regan <joregan@gmail.com>
Date: Wed, 28 Jun 2017 10:59:46 +0100
Subject: [PATCH 10/90] bounds checks

---
 spacy/lang/ga/irish_morphology_helpers.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py
index 2b008f295..383e24efc 100644
--- a/spacy/lang/ga/irish_morphology_helpers.py
+++ b/spacy/lang/ga/irish_morphology_helpers.py
@@ -8,25 +8,25 @@ class IrishMorph:
     vowels = broad_vowels + slender_vowels
 
     def ends_dentals(word):
-        if word[-1:] in ['d', 'n', 't', 's']:
+        if word != "" and word[-1] in ['d', 'n', 't', 's']:
             return True
         else:
             return False
 
     def devoice(word):
-        if word[-2] == 's' and word[-1] == 'd':
+        if len(word) > 2 and word[-2] == 's' and word[-1] == 'd':
             return word[:-1] + 't'
         else:
             return word
 
     def ends_with_vowel(word):
-        return word[-1] in vowels
+        return word != "" and word[-1] in IrishMorph.vowels  # class attrs are not in method scope
 
     def starts_with_vowel(word):
-        return word[0] in vowels
+        return word != "" and word[0] in IrishMorph.vowels  # class attrs are not in method scope
 
     def deduplicate(word):
-        if word[-2] == word[-1] and word[-1] in consonants:
+        if len(word) > 2 and word[-2] == word[-1] and word[-1] in IrishMorph.consonants:
             return word[:-1]
         else:
             return word

From 76c22dec4dba150fd848072472d0e4bb65fc4a65 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Tue, 8 Aug 2017 19:04:52 +0100
Subject: [PATCH 11/90] UD Irish tag mapping

---
 spacy/lang/ga/tag_map.py | 366 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 366 insertions(+)
 create mode 100644 spacy/lang/ga/tag_map.py

diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py
new file mode 100644
index 000000000..598d368bb
--- /dev/null
+++ b/spacy/lang/ga/tag_map.py
@@ -0,0 +1,366 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+TAG_MAP = {
+    "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+    "ADJ__Case=Gen|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "fem", "Number": "sing"},
+    "ADJ__Case=Gen|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing"},
+    "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "strong"}},
+    "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "weak"}},
+    "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
+    "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+    "ADJ__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "plur"},
+    "ADJ__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing"},
+    "ADJ__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "plur"},
+    "ADJ__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing"},
+    "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "notslender"}},
+    "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "slender"}},
+    "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Other": {"Form": "len"}},
+    "ADJ__Degree=Cmp,Sup": {"pos": "ADJ", "Degree": "cmp|sup"},
+    "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "ecl"}},
+    "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "hpref"}},
+    "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "len"}},
+    "ADJ__Degree=Pos": {"pos": "ADJ", "Degree": "pos"},
+    "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"},
+    "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}},
+    "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"},
+    "ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"},
+    "ADJ__Number=Plur": {"pos": "ADJ", "Number": "plur"},
+    "ADJ___": {"pos": "ADJ"},
+    "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"},
+    "ADP__Foreign=Yes": {"pos": "ADP", "Foreign": "yes"},
+    "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Other": {"Form": "len"}},
+    "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Other": {"Form": "len"}},
+    "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Other": {"Form": "len"}},
+    "ADP__Gender=Fem|Number=Sing|Person=3": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3},
+    "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"},
+    "ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"},
+    "ADP__Gender=Masc|Number=Sing|Person=3": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3},
+    "ADP__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"},
+    "ADP__Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"},
+    "ADP__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "ADP", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"},
+    "ADP__Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1},
+    "ADP__Number=Plur|Person=1|Poss=Yes": {"pos": "ADP", "Number": "plur", "Person": 1, "Poss": "yes"},
+    "ADP__Number=Plur|Person=1|PronType=Emp": {"pos": "ADP", "Number": "plur", "Person": 1, "PronType": "emp"},
+    "ADP__Number=Plur|Person=2": {"pos": "ADP", "Number": "plur", "Person": 2},
+    "ADP__Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3},
+    "ADP__Number=Plur|Person=3|Poss=Yes": {"pos": "ADP", "Number": "plur", "Person": 3, "Poss": "yes"},
+    "ADP__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Number": "plur", "Person": 3, "Poss": "yes", "PronType": "prs"},
+    "ADP__Number=Plur|Person=3|PronType=Emp": {"pos": "ADP", "Number": "plur", "Person": 3, "PronType": "emp"},
+    "ADP__Number=Plur|PronType=Art": {"pos": "ADP", "Number": "plur", "PronType": "art"},
+    "ADP__Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1},
+    "ADP__Number=Sing|Person=1|Poss=Yes": {"pos": "ADP", "Number": "sing", "Person": 1, "Poss": "yes"},
+    "ADP__Number=Sing|Person=1|PronType=Emp": {"pos": "ADP", "Number": "sing", "Person": 1, "PronType": "emp"},
+    "ADP__Number=Sing|Person=2": {"pos": "ADP", "Number": "sing", "Person": 2},
+    "ADP__Number=Sing|Person=3": {"pos": "ADP", "Number": "sing", "Person": 3},
+    "ADP__Number=Sing|PronType=Art": {"pos": "ADP", "Number": "sing", "PronType": "art"},
+    "ADP__Person=3|Poss=Yes": {"pos": "ADP", "Person": 3, "Poss": "yes"},
+    "ADP___": {"pos": "ADP"},
+    "ADP__Poss=Yes": {"pos": "ADP", "Poss": "yes"},
+    "ADP__PrepForm=Cmpd": {"pos": "ADP", "Other": {"PrepForm": "cmpd"}},
+    "ADP__PronType=Art": {"pos": "ADP", "PronType": "art"},
+    "ADV__Form=Len": {"pos": "ADV", "Other": {"Form": "len"}},
+    "ADV___": {"pos": "ADV"},
+    "ADV__PronType=Int": {"pos": "ADV", "PronType": "int"},
+    "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}},
+    "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}},
+    "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}},
+    "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}},
+    "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Form": "vf", "VerbForm": "cop"}},
+    "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"VerbForm": "cop"}},
+    "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Other": {"Mood": "int", "VerbForm": "cop"}},
+    "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Mood": "int", "VerbForm": "cop"}},
+    "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}},
+    "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}},
+    "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"PartType": "comp", "VerbForm": "cop"}},
+    "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}},
+    "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}},
+    "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"VerbForm": "cop"}},
+    "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"VerbForm": "cop"}},
+    "AUX___": {"pos": "AUX"},
+    "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "Other": {"VerbForm": "cop"}},
+    "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}},
+    "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}},
+    "AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"VerbForm": "cop"}},
+    "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"VerbForm": "cop"}},
+    "AUX__VerbForm=Cop": {"pos": "AUX", "Other": {"VerbForm": "cop"}},
+    "CCONJ___": {"pos": "CCONJ"},
+    "DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"},
+    "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Other": {"Form": "ecl"}},
+    "DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"},
+    "DET__Definite=Def|Number=Plur|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "plur", "PronType": "art"},
+    "DET__Definite=Def|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "sing", "PronType": "art"},
+    "DET__Definite=Def": {"pos": "DET", "Definite": "def"},
+    "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Other": {"Form": "hpref"}},
+    "DET__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"},
+    "DET__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"},
+    "DET__Number=Plur|Person=1|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 1, "Poss": "yes"},
+    "DET__Number=Plur|Person=3|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 3, "Poss": "yes"},
+    "DET__Number=Sing|Person=1|Poss=Yes": {"pos": "DET", "Number": "sing", "Person": 1, "Poss": "yes"},
+    "DET__Number=Sing|Person=2|Poss=Yes": {"pos": "DET", "Number": "sing", "Person": 2, "Poss": "yes"},
+    "DET__Number=Sing|PronType=Int": {"pos": "DET", "Number": "sing", "PronType": "int"},
+    "DET___": {"pos": "DET"},
+    "DET__PronType=Dem": {"pos": "DET", "PronType": "dem"},
+    "DET__PronType=Ind": {"pos": "DET", "PronType": "ind"},
+    "NOUN__Case=Dat|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Definite": "ind", "Gender": "fem", "Number": "sing"},
+    "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}},
+    "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
+    "NOUN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing"},
+    "NOUN__Case=Dat|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "masc", "Number": "sing"},
+    "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}},
+    "NOUN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"},
+    "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}},
+    "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}},
+    "NOUN__Case=Gen|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "sing"},
+    "NOUN__Case=Gen|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "ind", "Gender": "fem", "Number": "sing"},
+    "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}},
+    "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}},
+    "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}},
+    "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "weak"}},
+    "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}},
+    "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}},
+    "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
+    "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "strong"}},
+    "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "weak"}},
+    "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+    "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Other": {"Form": "len"}},
+    "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}},
+    "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "weak"}},
+    "NOUN__Case=Gen|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur"},
+    "NOUN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing"},
+    "NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}},
+    "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}},
+    "NOUN__Case=Gen|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur"},
+    "NOUN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing"},
+    "NOUN__Case=Gen|Number=Sing": {"pos": "NOUN", "Case": "gen", "Number": "sing"},
+    "NOUN__Case=Gen|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf"},
+    "NOUN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "plur"},
+    "NOUN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"},
+    "NOUN__Case=NomAcc|Definite=Def|Gender=Fem": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "fem"},
+    "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"},
+    "NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"},
+    "NOUN__Case=NomAcc|Definite=Ind|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "ind", "Gender": "masc", "Number": "plur"},
+    "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}},
+    "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}},
+    "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}},
+    "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}},
+    "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "emp"}},
+    "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "hpref"}},
+    "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}},
+    "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "hpref"}},
+    "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}},
+    "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "len"}},
+    "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
+    "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}},
+    "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+    "NOUN__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur"},
+    "NOUN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"},
+    "NOUN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"},
+    "NOUN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"},
+    "NOUN__Case=Voc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Definite": "def", "Gender": "masc", "Number": "plur"},
+    "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
+    "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}},
+    "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+    "NOUN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing"},
+    "NOUN__Degree=Pos": {"pos": "NOUN", "Degree": "pos"},
+    "NOUN__Foreign=Yes": {"pos": "NOUN", "Foreign": "yes"},
+    "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "ecl"}},
+    "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "ecl"}},
+    "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Other": {"Form": "ecl"}},
+    "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "hpref"}},
+    "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "len"}},
+    "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "len"}},
+    "NOUN__Gender=Fem|Number=Sing": {"pos": "NOUN", "Gender": "fem", "Number": "sing"},
+    "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "Other": {"PartType": "comp"}},
+    "NOUN__Number=Sing": {"pos": "NOUN", "Number": "sing"},
+    "NOUN___": {"pos": "NOUN"},
+    "NOUN__Reflex=Yes": {"pos": "NOUN", "Reflex": "yes"},
+    "NOUN__VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf"},
+    "NOUN__VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun"},
+    "NUM__Definite=Def|NumType=Card": {"pos": "NUM", "Definite": "def", "NumType": "card"},
+    "NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "ecl"}},
+    "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "ecl"}},
+    "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "hpref"}},
+    "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "len"}},
+    "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "len"}},
+    "NUM__NumType=Card": {"pos": "NUM", "NumType": "card"},
+    "NUM__NumType=Ord": {"pos": "NUM", "NumType": "ord"},
+    "NUM___": {"pos": "NUM"},
+    "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"Form": "ecl", "PartType": "vb"}},
+    "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "Other": {"PartType": "vb"}},
+    "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "Other": {"PartType": "vb"}},
+    "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"Mood": "int", "PartType": "vb"}},
+    "PART__PartType=Ad": {"pos": "PART", "Other": {"PartType": "ad"}},
+    "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "cmpl"}},
+    "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "cmpl"}},
+    "PART__PartType=Cmpl": {"pos": "PART", "Other": {"PartType": "cmpl"}},
+    "PART__PartType=Comp": {"pos": "PART", "Other": {"PartType": "comp"}},
+    "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "cop"}},
+    "PART__PartType=Deg": {"pos": "PART", "Other": {"PartType": "deg"}},
+    "PART__PartType=Inf": {"pos": "PART", "PartType": "inf"},
+    "PART__PartType=Num": {"pos": "PART", "Other": {"PartType": "num"}},
+    "PART__PartType=Pat": {"pos": "PART", "Other": {"PartType": "pat"}},
+    "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "vb"}},
+    "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Other": {"PartType": "vb"}},
+    "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}},
+    "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "vb"}},
+    "PART__PartType=Vb": {"pos": "PART", "Other": {"PartType": "vb"}},
+    "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "vb"}},
+    "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}},
+    "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "Other": {"PartType": "vb"}},
+    "PART__PartType=Voc": {"pos": "PART", "Other": {"PartType": "voc"}},
+    "PART___": {"pos": "PART"},
+    "PART__PronType=Rel": {"pos": "PART", "PronType": "rel"},
+    "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Other": {"Form": "len"}},
+    "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Other": {"Form": "len"}},
+    "PRON__Gender=Fem|Number=Sing|Person=3": {"pos": "PRON", "Gender": "fem", "Number": "sing", "Person": 3},
+    "PRON__Gender=Masc|Number=Sing|Person=3": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3},
+    "PRON__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"},
+    "PRON__Gender=Masc|Person=3": {"pos": "PRON", "Gender": "masc", "Person": 3},
+    "PRON__Number=Plur|Person=1": {"pos": "PRON", "Number": "plur", "Person": 1},
+    "PRON__Number=Plur|Person=1|PronType=Emp": {"pos": "PRON", "Number": "plur", "Person": 1, "PronType": "emp"},
+    "PRON__Number=Plur|Person=2": {"pos": "PRON", "Number": "plur", "Person": 2},
+    "PRON__Number=Plur|Person=3": {"pos": "PRON", "Number": "plur", "Person": 3},
+    "PRON__Number=Plur|Person=3|PronType=Emp": {"pos": "PRON", "Number": "plur", "Person": 3, "PronType": "emp"},
+    "PRON__Number=Sing|Person=1": {"pos": "PRON", "Number": "sing", "Person": 1},
+    "PRON__Number=Sing|Person=1|PronType=Emp": {"pos": "PRON", "Number": "sing", "Person": 1, "PronType": "emp"},
+    "PRON__Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2},
+    "PRON__Number=Sing|Person=2|PronType=Emp": {"pos": "PRON", "Number": "sing", "Person": 2, "PronType": "emp"},
+    "PRON__Number=Sing|Person=3": {"pos": "PRON", "Number": "sing", "Person": 3},
+    "PRON__Number=Sing|PronType=Int": {"pos": "PRON", "Number": "sing", "PronType": "int"},
+    "PRON__PronType=Dem": {"pos": "PRON", "PronType": "dem"},
+    "PRON__PronType=Ind": {"pos": "PRON", "PronType": "ind"},
+    "PRON__PronType=Int": {"pos": "PRON", "PronType": "int"},
+    "PRON__Reflex=Yes": {"pos": "PRON", "Reflex": "yes"},
+    "PROPN__Abbr=Yes": {"pos": "PROPN", "Other": {"Abbr": "yes"}},
+    "PROPN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "dat", "Gender": "fem", "Number": "sing"},
+    "PROPN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"},
+    "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}},
+    "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}},
+    "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}},
+    "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
+    "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Other": {"Form": "len"}},
+    "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+    "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Other": {"Form": "len"}},
+    "PROPN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing"},
+    "PROPN__Case=Gen|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem"},
+    "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}},
+    "PROPN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing"},
+    "PROPN__Case=Gen|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc"},
+    "PROPN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"},
+    "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"},
+    "PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"},
+    "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}},
+    "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}},
+    "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}},
+    "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
+    "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+    "PROPN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"},
+    "PROPN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"},
+    "PROPN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"},
+    "PROPN__Case=NomAcc|Gender=Masc": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc"},
+    "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Other": {"Form": "len"}},
+    "PROPN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "voc", "Gender": "masc", "Number": "sing"},
+    "PROPN__Gender=Masc|Number=Sing": {"pos": "PROPN", "Gender": "masc", "Number": "sing"},
+    "PROPN___": {"pos": "PROPN"},
+    "PUNCT___": {"pos": "PUNCT"},
+    "SCONJ___": {"pos": "SCONJ"},
+    "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "Other": {"VerbForm": "cop"}},
+    "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "Other": {"VerbForm": "cop"}},
+    "SYM__Abbr=Yes": {"pos": "SYM", "Other": {"Abbr": "yes"}},
+    "VERB__Case=NomAcc|Gender=Masc|Mood=Ind|Number=Sing|Tense=Pres": {"pos": "VERB", "Case": "nom|acc", "Gender": "masc", "Mood": "ind", "Number": "sing", "Tense": "pres"},
+    "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}},
+    "VERB__Foreign=Yes": {"pos": "VERB", "Foreign": "yes"},
+    "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl", "Voice": "auto"}},
+    "VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}},
+    "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}},
+    "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl", "Voice": "auto"}},
+    "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Other": {"Form": "ecl"}},
+    "VERB__Form=Ecl": {"pos": "VERB", "Other": {"Form": "ecl"}},
+    "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}},
+    "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Other": {"Form": "emp"}},
+    "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}},
+    "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}},
+    "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}},
+    "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Other": {"Form": "len"}},
+    "VERB__Form=Len": {"pos": "VERB", "Other": {"Form": "len"}},
+    "VERB__Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3},
+    "VERB__Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1},
+    "VERB__Mood=Cnd": {"pos": "VERB", "Mood": "cnd"},
+    "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Voice": "auto"}},
+    "VERB__Mood=Imp|Number=Plur|Person=1|Polarity=Neg": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1, "Polarity": "neg"},
+    "VERB__Mood=Imp|Number=Plur|Person=1": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1},
+    "VERB__Mood=Imp|Number=Plur|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 2},
+    "VERB__Mood=Imp|Number=Sing|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 2},
+    "VERB__Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past"},
+    "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past"},
+    "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres"},
+    "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past"},
+    "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres"},
+    "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Voice": "auto"}},
+    "VERB__Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres"},
+    "VERB__Mood=Ind|PronType=Rel|Tense=Fut": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "fut"},
+    "VERB__Mood=Ind|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "pres"},
+    "VERB__Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut"},
+    "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Voice": "auto"}},
+    "VERB__Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past"},
+    "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Voice": "auto"}},
+    "VERB__Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres"},
+    "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Voice": "auto"}},
+    "VERB___": {"pos": "VERB"},
+    "X__Abbr=Yes": {"pos": "X", "Other": {"Abbr": "yes"}},
+    "X__Case=NomAcc|Foreign=Yes|Gender=Fem|Number=Sing": {"pos": "X", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Foreign": "yes"},
+    "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Other": {"Dialect": "ulster"}},
+    "X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}},
+    "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Other": {"Dialect": "munster"}},
+    "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Voice": "auto"}},
+    "X__Dialect=Munster": {"pos": "X", "Other": {"Dialect": "munster"}},
+    "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Other": {"Dialect": "munster"}},
+    "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"Dialect": "ulster"}},
+    "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Other": {"Dialect": "ulster", "PartType": "vb"}},
+    "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}},
+    "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"},
+    "X___": {"pos": "X"}
+}
\ No newline at end of file

From c069b4acb5317098d95d753a30160e3b52bbb209 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Tue, 8 Aug 2017 19:22:14 +0100
Subject: [PATCH 12/90] fix in UD submitted; map either way

---
 spacy/lang/ga/tag_map.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py
index 598d368bb..22a6bacd0 100644
--- a/spacy/lang/ga/tag_map.py
+++ b/spacy/lang/ga/tag_map.py
@@ -25,7 +25,9 @@ TAG_MAP = {
     "ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"},
     "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}},
     "ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"},
+    "ADJ__Gender=Masc|Number=Sing|Case=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"},
     "ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"},
+    "ADJ__Number=Plur|Case=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"},
     "ADJ__Number=Plur": {"pos": "ADJ", "Number": "plur"},
     "ADJ___": {"pos": "ADJ"},
     "ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"},
@@ -363,4 +365,4 @@ TAG_MAP = {
     "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}},
     "X__Foreign=Yes": {"pos": "X", "Foreign": "yes"},
     "X___": {"pos": "X"}
-}
\ No newline at end of file
+}

From c283e9edfe9618e5b48193dad4b0b1844ffee72a Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Mon, 11 Sep 2017 08:57:48 +0100
Subject: [PATCH 13/90] first stab at test

---
 spacy/tests/lang/ga/__init__.py       |  0
 spacy/tests/lang/ga/test_tokenizer.py | 18 ++++++++++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 spacy/tests/lang/ga/__init__.py
 create mode 100644 spacy/tests/lang/ga/test_tokenizer.py

diff --git a/spacy/tests/lang/ga/__init__.py b/spacy/tests/lang/ga/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py
new file mode 100644
index 000000000..fe5cb0b2f
--- /dev/null
+++ b/spacy/tests/lang/ga/test_tokenizer.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+SV_TOKEN_EXCEPTION_TESTS = [
+    ('B\'fhearr fanacht as amharc', ['B\'', 'fhearr', 'fanacht', 'as', 'amharc']),
+    ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
+]
+
+
+@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS)
+def test_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens):
+    tokens = ga_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+

From 187be6d372c8ef86c77483a6558f7592c3d0a2dc Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Mon, 11 Sep 2017 09:33:17 +0100
Subject: [PATCH 14/90] copy/paste error

---
 spacy/tests/lang/ga/test_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py
index fe5cb0b2f..5b45dddc1 100644
--- a/spacy/tests/lang/ga/test_tokenizer.py
+++ b/spacy/tests/lang/ga/test_tokenizer.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest
 
 
-SV_TOKEN_EXCEPTION_TESTS = [
+GA_TOKEN_EXCEPTION_TESTS = [
     ('B\'fhearr fanacht as amharc', ['B\'', 'fhearr', 'fanacht', 'as', 'amharc']),
     ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
 ]

From 9dfd30196289536bf0bbc029d1b0d36c0adbc190 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Mon, 11 Sep 2017 10:14:18 +0100
Subject: [PATCH 15/90] rearrange

---
 spacy/lang/ga/__init__.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
index 7b72a8a91..38b73468f 100644
--- a/spacy/lang/ga/__init__.py
+++ b/spacy/lang/ga/__init__.py
@@ -10,15 +10,16 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class IrishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'ga'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
 class Irish(Language):
     lang = 'ga'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'ga'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = IrishDefaults
 
 
 __all__ = ['Irish']

From b1b6123867209d18cfd5ab958731aac997f4f0d6 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Mon, 11 Sep 2017 10:31:41 +0100
Subject: [PATCH 16/90] add ga_tokenizer

---
 spacy/tests/conftest.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index f5d65803a..1e9838d41 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -99,6 +99,10 @@ def sv_tokenizer():
 def bn_tokenizer():
     return util.get_lang_class('bn').Defaults.create_tokenizer()
 
+@pytest.fixture
+def ga_tokenizer():
+    return util.get_lang_class('ga').Defaults.create_tokenizer()
+
 
 @pytest.fixture
 def he_tokenizer():

From 7de709483bd9df2890672f2d17d8277d684d07d2 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Mon, 11 Sep 2017 10:51:21 +0100
Subject: [PATCH 17/90] missed adding here

---
 spacy/tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 1e9838d41..4da1ae301 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -12,7 +12,7 @@ from .. import util
 
 
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'ga', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_depvec_web_lg'],

From 8db3da3c3dbe70687ba39030b2fa513cb74d8749 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 14:06:25 +0100
Subject: [PATCH 18/90] Refactor JS, split into modules and add nomodule option

rollup.js will be compiled by the rollup package and Babel on build, and will be loaded if a browser doesn't yet support JS modules
---
 website/_harp.json                            |   4 +-
 website/_includes/_scripts.jade               |  81 +++--
 website/assets/js/changelog.js                |  72 ++++
 website/assets/js/github-embed.js             |  36 ++
 website/assets/js/main.js                     | 323 ------------------
 website/assets/js/models.js                   | 160 +++++++++
 website/assets/js/nav-highlighter.js          |  33 ++
 website/assets/js/progress.js                 |  52 +++
 website/assets/js/rollup.js                   |  23 ++
 website/assets/js/util.js                     |  56 +++
 website/assets/js/{ => vendor}/chart.min.js   |   0
 website/assets/js/{ => vendor}/in-view.min.js |   0
 website/assets/js/{ => vendor}/prism.min.js   |   0
 .../assets/js/{ => vendor}/quickstart.min.js  |   0
 14 files changed, 493 insertions(+), 347 deletions(-)
 create mode 100644 website/assets/js/changelog.js
 create mode 100644 website/assets/js/github-embed.js
 delete mode 100644 website/assets/js/main.js
 create mode 100644 website/assets/js/models.js
 create mode 100644 website/assets/js/nav-highlighter.js
 create mode 100644 website/assets/js/progress.js
 create mode 100644 website/assets/js/rollup.js
 create mode 100644 website/assets/js/util.js
 rename website/assets/js/{ => vendor}/chart.min.js (100%)
 rename website/assets/js/{ => vendor}/in-view.min.js (100%)
 rename website/assets/js/{ => vendor}/prism.min.js (100%)
 rename website/assets/js/{ => vendor}/quickstart.min.js (100%)

diff --git a/website/_harp.json b/website/_harp.json
index 7c69beef0..bc1a0b5e5 100644
--- a/website/_harp.json
+++ b/website/_harp.json
@@ -84,8 +84,8 @@
         ],
 
         "ALPHA": true,
-        "V_CSS": "2.0a1",
-        "V_JS": "2.0a0",
+        "V_CSS": "2.0a2",
+        "V_JS": "2.0a1",
         "DEFAULT_SYNTAX": "python",
         "ANALYTICS": "UA-58931649-1",
         "MAILCHIMP": {
diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade
index 5ecdd0711..e1d9f773a 100644
--- a/website/_includes/_scripts.jade
+++ b/website/_includes/_scripts.jade
@@ -1,43 +1,80 @@
 //- 💫 INCLUDES > SCRIPTS
 
 if quickstart
-        script(src="/assets/js/quickstart.min.js")
+    script(src="/assets/js/vendor/quickstart.min.js")
 
 if IS_PAGE
-    script(src="/assets/js/in-view.min.js")
+    script(src="/assets/js/vendor/in-view.min.js")
 
 if environment == "deploy"
     script(async src="https://www.google-analytics.com/analytics.js")
 
-script(src="/assets/js/prism.min.js")
-script(src="/assets/js/main.js?v#{V_JS}")
+script(src="/assets/js/vendor/prism.min.js")
+
+if SECTION == "models"
+    script(src="/assets/js/vendor/chart.min.js")
+    script(src="/assets/js/models.js?v#{V_JS}" type="module")
 
 script
-    | new ProgressBar('.js-progress');
-
-    if changelog
-        | new Changelog('!{SOCIAL.github}', 'spacy');
-
     if quickstart
         | new Quickstart("#qs");
 
-    if IS_PAGE
-        | new SectionHighlighter('data-section', 'data-nav');
-        | new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed');
-        | ((window.gitter = {}).chat = {}).options = {
-        |     useStyles: false,
-        |     activationElement: '.js-gitter-button',
-        |     targetElement: '.js-gitter',
-        |     room: '!{SOCIAL.gitter}'
-        | };
-
-    if HAS_MODELS
-        | new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_BENCHMARKS)});
-
     if environment == "deploy"
         | window.ga=window.ga||function(){
         | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
         | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
 
+
 if IS_PAGE
+    script
+        | ((window.gitter = {}).chat = {}).options = {
+        |     useStyles: false,
+        |     activationElement: '.js-gitter-button',
+        |     targetElement: '.js-gitter',
+        |     room: '!{SOCIAL.gitter}'
+        | };
     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
+
+
+//- JS modules – slightly hacky, but necessary to dynamically instantiate the
+    classes with data from the Harp JSON files, while still being able to
+    support older browsers that can't handle JS modules. More details:
+    https://medium.com/dev-channel/es6-modules-in-chrome-canary-m60-ba588dfb8ab7
+
+- ProgressBar = "new ProgressBar('.js-progress');"
+- Changelog = "new Changelog('" + SOCIAL.github + "', 'spacy');"
+- NavHighlighter = "new NavHighlighter('data-section', 'data-nav');"
+- GitHubEmbed = "new GitHubEmbed('" + SOCIAL.github + "', 'data-gh-embed');"
+- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
+
+//- Browsers with JS module support.
+    Will be ignored otherwise.
+
+script(type="module")
+    | import ProgressBar from '/assets/js/progress.js';
+    !=ProgressBar
+    if changelog
+        | import Changelog from '/assets/js/changelog.js';
+        !=Changelog
+    if IS_PAGE
+        | import NavHighlighter from '/assets/js/nav-highlighter.js';
+        !=NavHighlighter
+        | import GitHubEmbed from '/assets/js/github-embed.js';
+        !=GitHubEmbed
+    if HAS_MODELS
+        | import { ModelLoader } from '/assets/js/models.js';
+        !=ModelLoader
+
+//- Browsers with no JS module support. Uses the bundled rollup.js with the
+    same instantiation strings; won't be fetched or interpreted otherwise.
+
+script(nomodule src="/assets/js/rollup.js")
+script(nomodule)
+    !=ProgressBar
+    if changelog
+        !=Changelog
+    if IS_PAGE
+        !=NavHighlighter
+        !=GitHubEmbed
+    if HAS_MODELS
+        !=ModelLoader
diff --git a/website/assets/js/changelog.js b/website/assets/js/changelog.js
new file mode 100644
index 000000000..94f2149ad
--- /dev/null
+++ b/website/assets/js/changelog.js
@@ -0,0 +1,72 @@
+'use strict';
+
+import { Templater, handleResponse } from './util.js';
+
+export default class Changelog {
+    /**
+     * Fetch and render changelog from GitHub. Clones a template node (table row)
+     * to avoid doubling templating markup in JavaScript.
+     * @param {string} user - GitHub username.
+     * @param {string} repo - Repository to fetch releases from.
+     */
+    constructor(user, repo) {
+        this.url = `https://api.github.com/repos/${user}/${repo}/releases`;
+        this.template = new Templater('changelog');
+        this.fetchChangelog()
+            .then(json => this.render(json))
+            .catch(this.showError.bind(this))
+            // recalculate scroll positions (progress bar etc.) once content is in
+            .then(() => window.dispatchEvent(new Event('resize')));
+    }
+
+    fetchChangelog() {
+        // return the fetch chain itself, so network failures reject as well
+        return fetch(this.url)
+            .then(res => handleResponse(res))
+            .then(json => json.ok ? json : Promise.reject());
+    }
+
+    showError() {
+        this.template.get('error').style.display = 'block';
+    }
+
+    /**
+     * Get template section from template row. Hacky, but does make sense.
+     * @param {node} item - Parent element.
+     * @param {string} id - ID of child element, set via data-changelog.
+     */
+    getField(item, id) {
+        return item.querySelector(`[data-changelog="${id}"]`);
+    }
+
+    render(json) {
+        this.template.get('table').style.display = 'block';
+        this.row = this.template.get('item');
+        this.releases = this.template.get('releases');
+        this.prereleases = this.template.get('prereleases');
+        Object.values(json)
+            .filter(release => release.name)
+            .forEach(release => this.renderRelease(release));
+        this.row.remove();
+    }
+
+    /**
+     * Clone the template row and populate with content from API response.
+     * https://developer.github.com/v3/repos/releases/#list-releases-for-a-repository
+     * @param {string} name - Release title.
+     * @param {string} tag (tag_name) - Release tag.
+     * @param {string} url (html_url) - URL to the release page on GitHub.
+     * @param {string} date (published_at) - Timestamp of release publication.
+     * @param {boolean} prerelease - Whether the release is a prerelease.
+     */
+    renderRelease({ name, tag_name: tag, html_url: url, published_at: date, prerelease }) {
+        const container = prerelease ? this.prereleases : this.releases;
+        const tagLink = `<a href="${url}" target="_blank"><code>${tag}</code></a>`;
+        const title = (name.split(': ').length == 2) ? name.split(': ')[1] : name;
+        const row = this.row.cloneNode(true);
+        this.getField(row, 'date').textContent = date.split('T')[0];
+        this.getField(row, 'tag').innerHTML = tagLink;
+        this.getField(row, 'title').textContent = title;
+        container.appendChild(row);
+    }
+}
diff --git a/website/assets/js/github-embed.js b/website/assets/js/github-embed.js
new file mode 100644
index 000000000..58e80ee1a
--- /dev/null
+++ b/website/assets/js/github-embed.js
@@ -0,0 +1,36 @@
+'use strict';
+
+import { $$ } from './util.js';
+
+export default class GitHubEmbed {
+    /**
+     * Embed code from GitHub repositories, similar to Gist embeds. Fetches the
+     * raw text and places it inside element.
+     * Usage: <pre><code data-gh-embed="spacy/master/examples/x.py"></code></pre>
+     * @param {string} user - GitHub user or organization.
+     * @param {string} attr - Data attribute used to select containers. Attribute
+     *                        value should be path to file relative to user.
+     */
+    constructor(user, attr) {
+        this.url = `https://raw.githubusercontent.com/${user}`;
+        this.attr = attr;
+        this.error = `\nCan't fetch code example from GitHub :(\n\nPlease use the link below to view the example. If you've come across\na broken link, we always appreciate a pull request to the repository,\nor a report on the issue tracker. Thanks!`;
+        [...$$(`[${this.attr}]`)].forEach(el => this.embed(el));
+    }
+
+    /**
+     * Fetch code from GitHub and insert it as element content. File path is
+     * read off the container's data attribute.
+     * @param {node} el - The element.
+     */
+    embed(el) {
+        el.parentElement.setAttribute('data-loading', '');
+        fetch(`${this.url}/${el.getAttribute(this.attr)}`)
+            .then(res => res.text().then(text => ({ text, ok: res.ok })))
+            .then(({ text, ok }) => {
+                el.textContent = ok ? text : this.error;
+                if (ok && window.Prism) Prism.highlightElement(el);
+            }, () => { el.textContent = this.error; })
+            .then(() => el.parentElement.removeAttribute('data-loading'));
+    }
+}
diff --git a/website/assets/js/main.js b/website/assets/js/main.js
deleted file mode 100644
index d9465bb67..000000000
--- a/website/assets/js/main.js
+++ /dev/null
@@ -1,323 +0,0 @@
-//- 💫 MAIN JAVASCRIPT
-//- Note: Will be compiled using Babel before deployment.
-
-'use strict'
-
-const $ = document.querySelector.bind(document);
-const $$ = document.querySelectorAll.bind(document);
-
-
-class ProgressBar {
-    /**
-     * Animated reading progress bar.
-     * @param {String} selector – CSS selector of progress bar element.
-     */
-    constructor(selector) {
-        this.el = $(selector);
-        this.scrollY = 0;
-        this.sizes = this.updateSizes();
-        this.el.setAttribute('max', 100);
-        this.init();
-    }
-
-    init() {
-        window.addEventListener('scroll', () => {
-            this.scrollY = (window.pageYOffset || document.scrollTop) - (document.clientTop || 0);
-            requestAnimationFrame(this.update.bind(this));
-        }, false);
-        window.addEventListener('resize', () => {
-            this.sizes = this.updateSizes();
-            requestAnimationFrame(this.update.bind(this));
-        })
-    }
-
-    update() {
-        const offset = 100 - ((this.sizes.height - this.scrollY - this.sizes.vh) / this.sizes.height * 100);
-        this.el.setAttribute('value', (this.scrollY == 0) ? 0 : offset || 0);
-    }
-
-    updateSizes() {
-        const body = document.body;
-        const html = document.documentElement;
-        return {
-            height: Math.max(body.scrollHeight, body.offsetHeight, html.clientHeight, html.scrollHeight, html.offsetHeight),
-            vh: Math.max(html.clientHeight, window.innerHeight || 0)
-        }
-    }
-}
-
-
-class SectionHighlighter {
-    /**
-     * Hightlight section in viewport in sidebar, using in-view library.
-     * @param {String} sectionAttr - Data attribute of sections.
-     * @param {String} navAttr - Data attribute of navigation items.
-     * @param {String} activeClass – Class name of active element.
-     */
-    constructor(sectionAttr, navAttr, activeClass = 'is-active') {
-        this.sections = [...$$(`[${navAttr}]`)];
-        this.navAttr = navAttr;
-        this.sectionAttr = sectionAttr;
-        this.activeClass = activeClass;
-        inView(`[${sectionAttr}]`).on('enter', this.highlightSection.bind(this));
-    }
-
-    highlightSection(section) {
-        const id = section.getAttribute(this.sectionAttr);
-        const el = $(`[${this.navAttr}="${id}"]`);
-        if (el) {
-            this.sections.forEach(el => el.classList.remove(this.activeClass));
-            el.classList.add(this.activeClass);
-        }
-    }
-}
-
-
-class Templater {
-    /**
-     * Mini templating engine based on data attributes. Selects elements based
-     * on a data-tpl and data-tpl-key attribute and can set textContent
-     * and innterHtml.
-     *
-     * @param {String} templateId - Template section, e.g. value of data-tpl.
-     */
-    constructor(templateId) {
-        this.templateId = templateId;
-    }
-
-    get(key) {
-        return $(`[data-tpl="${this.templateId}"][data-tpl-key="${key}"]`);
-    }
-
-    fill(key, value, html = false) {
-        const el = this.get(key);
-        if (html) el.innerHTML = value || '';
-        else el.textContent = value || '';
-        return el;
-    }
-}
-
-
-class ModelLoader {
-    /**
-     * Load model meta from GitHub and update model details on site. Uses the
-     * Templater mini template engine to update DOM.
-     *
-     * @param {String} repo - Path tp GitHub repository containing releases.
-     * @param {Array} models - List of model IDs, e.g. "en_core_web_sm".
-     * @param {Object} licenses - License IDs mapped to URLs.
-     * @param {Object} accKeys - Available accuracy keys mapped to display labels.
-     */
-    constructor(repo, models = [], licenses = {}, benchmarkKeys = {}) {
-        this.url = `https://raw.githubusercontent.com/${repo}/master`;
-        this.repo = `https://github.com/${repo}`;
-        this.modelIds = models;
-        this.licenses = licenses;
-        this.benchKeys = benchmarkKeys;
-        this.init();
-    }
-
-    init() {
-        this.modelIds.forEach(modelId =>
-            new Templater(modelId).get('table').setAttribute('data-loading', ''));
-        fetch(`${this.url}/compatibility.json`)
-            .then(res => this.handleResponse(res))
-            .then(json => json.ok ? this.getModels(json['spacy']) : this.modelIds.forEach(modelId => this.showError(modelId)))
-    }
-
-    handleResponse(res) {
-        if (res.ok) return res.json().then(json => Object.assign({}, json, { ok: res.ok }))
-        else return ({ ok: res.ok })
-    }
-
-    convertNumber(num, separator = ',') {
-        return num.toString().replace(/\B(?=(\d{3})+(?!\d))/g, separator);
-    }
-
-    getModels(compat) {
-        this.compat = compat;
-        for (let modelId of this.modelIds) {
-            const version = this.getLatestVersion(modelId, compat);
-            if (!version) {
-                this.showError(modelId); return;
-            }
-            fetch(`${this.url}/meta/${modelId}-${version}.json`)
-                .then(res => this.handleResponse(res))
-                .then(json => json.ok ? this.render(json) : this.showError(modelId))
-        }
-        // make sure scroll positions for progress bar etc. are recalculated
-        window.dispatchEvent(new Event('resize'));
-    }
-
-    showError(modelId) {
-        const template = new Templater(modelId);
-        template.get('table').removeAttribute('data-loading');
-        template.get('error').style.display = 'block';
-        for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
-            template.get(key).parentElement.parentElement.style.display = 'none';
-        }
-    }
-
-    /**
-     * Update model details in tables. Currently quite hacky :(
-     */
-    render({ lang, name, version, sources, pipeline, vectors, url, author, license, accuracy, speed, size, description, notes }) {
-        const modelId = `${lang}_${name}`;
-        const model = `${modelId}-${version}`;
-        const template = new Templater(modelId);
-
-        const getSources = s => (s instanceof Array) ? s.join(', ') : s;
-        const getPipeline = p => p.map(comp => `<code>${comp}</code>`).join(', ');
-        const getVectors = v => `${this.convertNumber(v.entries)} (${v.width} dimensions)`;
-        const getLink = (t, l) => `<a href="${l}" target="_blank">${t}</a>`;
-
-        const keys = { version, size, description, notes }
-        Object.keys(keys).forEach(key => template.fill(key, keys[key]));
-
-        if (sources) template.fill('sources', getSources(sources));
-        if (pipeline && pipeline.length) template.fill('pipeline', getPipeline(pipeline), true);
-        else template.get('pipeline').parentElement.parentElement.style.display = 'none';
-        if (vectors) template.fill('vectors', getVectors(vectors));
-        else template.get('vectors').parentElement.parentElement.style.display = 'none';
-
-        if (author) template.fill('author', url ? getLink(author, url) : author, true);
-        if (license) template.fill('license', this.licenses[license] ? getLink(license, this.licenses[license]) : license, true);
-
-        template.get('download').setAttribute('href', `${this.repo}/releases/tag/${model}`);
-
-        this.renderBenchmarks(template, accuracy, speed);
-        this.renderCompat(template, modelId);
-        template.get('table').removeAttribute('data-loading');
-    }
-
-    renderBenchmarks(template, accuracy = {}, speed = {}) {
-        if (!accuracy && !speed) return;
-        template.get('benchmarks').style.display = 'block';
-        this.renderTable(template, 'parser', accuracy, val => val.toFixed(2));
-        this.renderTable(template, 'ner', accuracy, val => val.toFixed(2));
-        this.renderTable(template, 'speed', speed, Math.round);
-    }
-
-    renderTable(template, id, benchmarks, convertVal = val => val) {
-        if (!this.benchKeys[id] || !Object.keys(this.benchKeys[id]).some(key => benchmarks[key])) return;
-        const keys = Object.keys(this.benchKeys[id]).map(k => benchmarks[k] ? k : false).filter(k => k);
-        template.get(id).style.display = 'block';
-        for (let key of keys) {
-            template
-                .fill(key, this.convertNumber(convertVal(benchmarks[key])))
-                .parentElement.style.display = 'table-row';
-        }
-    }
-
-    renderCompat(template, modelId) {
-        template.get('compat-wrapper').style.display = 'table-row';
-        const options = Object.keys(this.compat).map(v => `<option value="${v}">v${v}</option>`).join('');
-        template
-            .fill('compat', '<option selected disabled>spaCy version</option>' + options, true)
-            .addEventListener('change', ev => {
-                const result = this.compat[ev.target.value][modelId];
-                if (result) template.fill('compat-versions', `<code>${modelId}-${result[0]}</code>`, true);
-                else template.fill('compat-versions', '');
-            });
-    }
-
-    getLatestVersion(model, compat = {}) {
-        for (let spacy_v of Object.keys(compat)) {
-            const models = compat[spacy_v];
-            if (models[model]) return models[model][0];
-        }
-    }
-}
-
-
-class Changelog {
-    /**
-     * Fetch and render changelog from GitHub. Clones a template node (table row)
-     * to avoid doubling templating markup in JavaScript.
-     *
-     * @param {String} user - GitHub username.
-     * @param {String} repo - Repository to fetch releases from.
-     */
-    constructor(user, repo) {
-        this.url = `https://api.github.com/repos/${user}/${repo}/releases`;
-        this.template = new Templater('changelog');
-        fetch(this.url)
-            .then(res => this.handleResponse(res))
-            .then(json => json.ok ? this.render(json) : false)
-    }
-
-    /**
-     * Get template section from template row. Slightly hacky, but does make sense.
-     */
-    $(item, id) {
-        return item.querySelector(`[data-changelog="${id}"]`);
-    }
-
-    handleResponse(res) {
-        if (res.ok) return res.json().then(json => Object.assign({}, json, { ok: res.ok }))
-        else return ({ ok: res.ok })
-    }
-
-    render(json) {
-        this.template.get('error').style.display = 'none';
-        this.template.get('table').style.display = 'block';
-        this.row = this.template.get('item');
-        this.releases = this.template.get('releases');
-        this.prereleases = this.template.get('prereleases');
-        Object.values(json)
-            .filter(release => release.name)
-            .forEach(release => this.renderRelease(release));
-        this.row.remove();
-        // make sure scroll positions for progress bar etc. are recalculated
-        window.dispatchEvent(new Event('resize'));
-    }
-
-    /**
-     * Clone the template row and populate with content from API response.
-     * https://developer.github.com/v3/repos/releases/#list-releases-for-a-repository
-     *
-     * @param {String} name - Release title.
-     * @param {String} tag (tag_name) - Release tag.
-     * @param {String} url (html_url) - URL to the release page on GitHub.
-     * @param {String} date (published_at) - Timestamp of release publication.
-     * @param {Boolean} pre (prerelease) - Whether the release is a prerelease.
-     */
-    renderRelease({ name, tag_name: tag, html_url: url, published_at: date, prerelease: pre }) {
-        const container = pre ? this.prereleases : this.releases;
-        const row = this.row.cloneNode(true);
-        this.$(row, 'date').textContent = date.split('T')[0];
-        this.$(row, 'tag').innerHTML = `<a href="${url}" target="_blank"><code>${tag}</code></a>`;
-        this.$(row, 'title').textContent = (name.split(': ').length == 2) ? name.split(': ')[1] : name;
-        container.appendChild(row);
-    }
-}
-
-
-class GitHubEmbed {
-    /**
-     * Embed code from GitHub repositories, similar to Gist embeds. Fetches the
-     * raw text and places it inside element.
-     * Usage: <pre><code data-gh-embed="spacy/master/examples/x.py"></code><pre>
-     *
-     * @param {String} user - GitHub user or organization.
-     * @param {String} attr - Data attribute used to select containers. Attribute
-     *                        value should be path to file relative to user.
-     */
-    constructor(user, attr) {
-        this.url = `https://raw.githubusercontent.com/${user}`;
-        this.attr = attr;
-        this.error = `\nCan't fetch code example from GitHub :(\n\nPlease use the link below to view the example. If you've come across\na broken link, we always appreciate a pull request to the repository,\nor a report on the issue tracker. Thanks!`;
-        [...$$(`[${this.attr}]`)].forEach(el => this.embed(el));
-    }
-
-    embed(el) {
-        el.parentElement.setAttribute('data-loading', '');
-        fetch(`${this.url}/${el.getAttribute(this.attr)}`)
-            .then(res => res.text().then(text => ({ text, ok: res.ok })))
-            .then(({ text, ok }) => {
-                el.textContent = ok ? text : this.error;
-                if (ok && window.Prism) Prism.highlightElement(el);
-            })
-        el.parentElement.removeAttribute('data-loading');
-    }
-}
diff --git a/website/assets/js/models.js b/website/assets/js/models.js
new file mode 100644
index 000000000..5fe7ff54a
--- /dev/null
+++ b/website/assets/js/models.js
@@ -0,0 +1,160 @@
+'use strict';
+
+import { Templater, handleResponse, convertNumber } from './util.js';
+
+/**
+ * Chart.js defaults
+ */
+Chart.defaults.global.legend.position = 'bottom';
+Chart.defaults.global.defaultFontFamily = "-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'";
+const CHART_COLORS = { model1: '#09a3d5', model2: '#066B8C' };
+
+/**
+ * Formatters for model details.
+ * @property {function} author – Format model author with optional link.
+ * @property {function} license - Format model license with optional link.
+ * @property {function} sources - Format training data sources (list or string).
+ * @property {function} pipeline - Format list of pipeline components.
+ * @property {function} vectors - Format vector data (entries and dimensions).
+ * @property {function} version - Format model version number.
+ */
+export const formats = {
+    author: (author, url) => url ? `<a href="${url}" target="_blank">${author}</a>` : author,
+    license: (license, url) => url ? `<a href="${url}" target="_blank">${license}</a>` : license,
+    sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
+    pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
+    vectors: vec => vec ? `${convertNumber(vec.entries)} (${vec.width} dimensions)` : 'n/a',
+    version: version => `<code>v${version}</code>`
+};
+
+/**
+ * Find the latest version of a model in a compatibility table.
+ * @param {string} model - The model name.
+ * @param {Object} compat - Compatibility table, keyed by spaCy version.
+ */
+export const getLatestVersion = (model, compat = {}) => {
+    for (let [spacy_v, models] of Object.entries(compat)) {
+        if (models[model]) return models[model][0];
+    }
+};
+
+export class ModelLoader {
+    /**
+     * Load model meta from GitHub and update model details on site. Uses the
+     * Templater mini template engine to update DOM.
+     * @param {string} repo - Path to GitHub repository containing releases.
+     * @param {Array} models - List of model IDs, e.g. "en_core_web_sm".
+     * @param {Object} licenses - License IDs mapped to URLs.
+     * @param {Object} benchmarkKeys - Objects of available keys by type, e.g.
+     *                                 'parser', 'ner', 'speed', mapped to labels.
+     */
+    constructor(repo, models = [], licenses = {}, benchmarkKeys = {}) {
+        this.url = `https://raw.githubusercontent.com/${repo}/master`;
+        this.repo = `https://github.com/${repo}`;
+        this.modelIds = models;
+        this.licenses = licenses;
+        this.benchKeys = benchmarkKeys;
+        this.init();
+    }
+
+    init() {
+        this.modelIds.forEach(modelId =>
+            new Templater(modelId).get('table').setAttribute('data-loading', ''));
+        this.fetch(`${this.url}/compatibility.json`)
+            .then(json => this.getModels(json.spacy))
+            .catch(_ => this.modelIds.forEach(modelId => this.showError(modelId)));
+        // make sure scroll positions for progress bar etc. are recalculated
+        window.dispatchEvent(new Event('resize'));
+    }
+
+    fetch(url) {
+        return new Promise((resolve, reject) =>
+            fetch(url).then(res => handleResponse(res))
+                .then(json => json.ok ? resolve(json) : reject()))
+    }
+
+    getModels(compat) {
+        this.compat = compat;
+        for (let modelId of this.modelIds) {
+            const version = getLatestVersion(modelId, compat);
+            if (version) this.fetch(`${this.url}/meta/${modelId}-${version}.json`)
+                .then(json => this.render(json))
+                .catch(_ => this.showError(modelId))
+            else this.showError(modelId);
+        }
+    }
+
+    showError(modelId) {
+        const tpl = new Templater(modelId);
+        tpl.get('table').removeAttribute('data-loading');
+        tpl.get('error').style.display = 'block';
+        for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
+            tpl.get(key).parentElement.parentElement.style.display = 'none';
+        }
+    }
+
+    /**
+     * Update model details in tables. Currently quite hacky :(
+     */
+    render(data) {
+        const modelId = `${data.lang}_${data.name}`;
+        const model = `${modelId}-${data.version}`;
+        const tpl = new Templater(modelId);
+        this.renderDetails(tpl, data)
+        this.renderBenchmarks(tpl, data.accuracy, data.speed);
+        this.renderCompat(tpl, modelId);
+        tpl.get('download').setAttribute('href', `${this.repo}/releases/tag/${model}`);
+        tpl.get('table').removeAttribute('data-loading');
+    }
+
+    renderDetails(tpl, { version, size, description, notes, author, url,
+        license, sources, vectors, pipeline }) {
+        const basics = { version, size, description, notes }
+        for (let [key, value] of Object.entries(basics)) {
+            if (value) tpl.fill(key, value);
+        }
+        if (author) tpl.fill('author', formats.author(author, url), true);
+        if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
+        if (sources) tpl.fill('sources', formats.sources(sources));
+        if (vectors) tpl.fill('vectors', formats.vectors(vectors));
+        else tpl.get('vectors').parentElement.parentElement.style.display = 'none';
+        if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
+        else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
+    }
+
+    renderBenchmarks(tpl, accuracy = {}, speed = {}) {
+        if (!accuracy && !speed) return;
+        this.renderTable(tpl, 'parser', accuracy, val => val.toFixed(2));
+        this.renderTable(tpl, 'ner', accuracy, val => val.toFixed(2));
+        this.renderTable(tpl, 'speed', speed, Math.round);
+        tpl.get('benchmarks').style.display = 'block';
+    }
+
+    renderTable(tpl, id, benchmarks, converter = val => val) {
+        if (!this.benchKeys[id] || !Object.keys(this.benchKeys[id]).some(key => benchmarks[key])) return;
+        for (let key of Object.keys(this.benchKeys[id])) {
+            if (benchmarks[key]) tpl
+                .fill(key, convertNumber(converter(benchmarks[key])))
+                .parentElement.style.display = 'table-row';
+        }
+        tpl.get(id).style.display = 'block';
+    }
+
+    renderCompat(tpl, modelId) {
+        tpl.get('compat-wrapper').style.display = 'table-row';
+        const header = '<option selected disabled>spaCy version</option>';
+        const options = Object.keys(this.compat)
+            .map(v => `<option value="${v}">v${v}</option>`)
+            .join('');
+        tpl
+            .fill('compat', header + options, true)
+            .addEventListener('change', ({ target: { value }}) =>
+                tpl.fill('compat-versions', this.getCompat(value, modelId), true))
+    }
+
+    getCompat(version, model) {
+        const res = this.compat[version][model];
+        return res ? `<code>${model}-${res[0]}</code>` : '<em>not compatible</em>';
+    }
+}
+
diff --git a/website/assets/js/nav-highlighter.js b/website/assets/js/nav-highlighter.js
new file mode 100644
index 000000000..40f708e5e
--- /dev/null
+++ b/website/assets/js/nav-highlighter.js
@@ -0,0 +1,33 @@
+'use strict';
+
+import { $, $$ } from './util.js';
+
+export default class NavHighlighter {
+    /**
+     * Highlight section in viewport in sidebar, using in-view library.
+     * @param {string} sectionAttr - Data attribute of sections.
+     * @param {string} navAttr - Data attribute of navigation items.
+     * @param {string} activeClass – Class name of active element.
+     */
+    constructor(sectionAttr, navAttr, activeClass = 'is-active') {
+        this.sections = [...$$(`[${navAttr}]`)];
+        this.navAttr = navAttr;
+        this.sectionAttr = sectionAttr;
+        this.activeClass = activeClass;
+        if (window.inView) inView(`[${sectionAttr}]`)
+            .on('enter', this.highlightSection.bind(this));
+    }
+
+    /**
+     * Check if section in view exists in sidebar and mark as active.
+     * @param {node} section - The section in view.
+     */
+    highlightSection(section) {
+        const id = section.getAttribute(this.sectionAttr);
+        const el = $(`[${this.navAttr}="${id}"]`);
+        if (el) {
+            this.sections.forEach(el => el.classList.remove(this.activeClass));
+            el.classList.add(this.activeClass);
+        }
+    }
+}
diff --git a/website/assets/js/progress.js b/website/assets/js/progress.js
new file mode 100644
index 000000000..1497547d8
--- /dev/null
+++ b/website/assets/js/progress.js
@@ -0,0 +1,52 @@
+'use strict';
+
+import { $ } from './util.js';
+
+export default class ProgressBar {
+    /**
+     * Animated reading progress bar.
+     * @param {string} selector – CSS selector of progress bar element.
+     */
+    constructor(selector) {
+        this.scrollY = 0;
+        this.sizes = this.updateSizes();
+        this.el = $(selector);
+        this.el.setAttribute('max', 100);
+        window.addEventListener('scroll', this.onScroll.bind(this));
+        window.addEventListener('resize', this.onResize.bind(this));
+    }
+
+    onScroll(ev) {
+        this.scrollY = (window.pageYOffset || document.scrollTop) - (document.clientTop || 0);
+        requestAnimationFrame(this.update.bind(this));
+    }
+
+    onResize(ev) {
+        this.sizes = this.updateSizes();
+        requestAnimationFrame(this.update.bind(this));
+    }
+
+    update() {
+        const offset = 100 - ((this.sizes.height - this.scrollY - this.sizes.vh) / this.sizes.height * 100);
+        this.el.setAttribute('value', (this.scrollY == 0) ? 0 : offset || 0);
+    }
+
+    /**
+     * Update scroll and viewport height. Called on load and window resize.
+     */
+    updateSizes() {
+        return {
+            height: Math.max(
+                document.body.scrollHeight,
+                document.body.offsetHeight,
+                document.documentElement.clientHeight,
+                document.documentElement.scrollHeight,
+                document.documentElement.offsetHeight
+            ),
+            vh: Math.max(
+                document.documentElement.clientHeight,
+                window.innerHeight || 0
+            )
+        }
+    }
+}
diff --git a/website/assets/js/rollup.js b/website/assets/js/rollup.js
new file mode 100644
index 000000000..00ff92fa9
--- /dev/null
+++ b/website/assets/js/rollup.js
@@ -0,0 +1,23 @@
+/**
+ * This file is bundled by Rollup, compiled with Babel and included as
+ * <script nomodule> for older browsers that don't yet support JavaScript
+ * modules. Browsers that do will ignore this bundle and won't even fetch it
+ * from the server. Details:
+ * https://github.com/rollup/rollup
+ * https://medium.com/dev-channel/es6-modules-in-chrome-canary-m60-ba588dfb8ab7
+ */
+
+// Import all modules that are instantiated directly in _includes/_scripts.jade
+import ProgressBar from './progress.js';
+import NavHighlighter from './nav-highlighter.js';
+import Changelog from './changelog.js';
+import GitHubEmbed from './github-embed.js';
+import { ModelLoader, ModelComparer } from './models.js';
+
+// Assign to window so they are bundled by rollup
+window.ProgressBar = ProgressBar;
+window.NavHighlighter = NavHighlighter;
+window.Changelog = Changelog;
+window.GitHubEmbed = GitHubEmbed;
+window.ModelLoader = ModelLoader;
+window.ModelComparer = ModelComparer;
diff --git a/website/assets/js/util.js b/website/assets/js/util.js
new file mode 100644
index 000000000..6bf14f578
--- /dev/null
+++ b/website/assets/js/util.js
@@ -0,0 +1,56 @@
+'use strict';
+
+export const $ = document.querySelector.bind(document);
+export const $$ = document.querySelectorAll.bind(document);
+
+export class Templater {
+    /**
+     * Mini templating engine based on data attributes. Selects elements based
+     * on a data-tpl and data-tpl-key attribute and can set textContent
+     * and innerHTML.
+     * @param {string} templateId - Template section, e.g. value of data-tpl.
+     */
+    constructor(templateId) {
+        this.templateId = templateId;
+    }
+
+    /**
+     * Get an element from the template and return it.
+     * @param {string} key - Name of the key within the current template.
+     */
+    get(key) {
+        return $(`[data-tpl="${this.templateId}"][data-tpl-key="${key}"]`);
+    }
+
+    /**
+     * Fill the content of a template element with a value.
+     * @param {string} key - Name of the key within the current template.
+     * @param {string} value - Content to insert into template element.
+     * @param {boolean} html - Insert content as HTML. Defaults to false.
+     */
+    fill(key, value, html = false) {
+        const el = this.get(key);
+        if (html) el.innerHTML = value || '';
+        else el.textContent = value || '';
+        return el;
+    }
+}
+
+/**
+ * Handle API response and assign status to returned JSON.
+ * @param {Response} res – The response.
+ */
+export const handleResponse = res => {
+    if (res.ok) return res.json()
+        .then(json => Object.assign({}, json, { ok: res.ok }))
+    else return ({ ok: res.ok })
+};
+
+
+/**
+ * Convert a number to a string and add thousand separator.
+ * @param {number|string} num - The number to convert.
+ * @param {string} separator – Thousand separator.
+ */
+export const convertNumber = (num, separator = ',') =>
+    num.toString().replace(/\B(?=(\d{3})+(?!\d))/g, separator);
diff --git a/website/assets/js/chart.min.js b/website/assets/js/vendor/chart.min.js
similarity index 100%
rename from website/assets/js/chart.min.js
rename to website/assets/js/vendor/chart.min.js
diff --git a/website/assets/js/in-view.min.js b/website/assets/js/vendor/in-view.min.js
similarity index 100%
rename from website/assets/js/in-view.min.js
rename to website/assets/js/vendor/in-view.min.js
diff --git a/website/assets/js/prism.min.js b/website/assets/js/vendor/prism.min.js
similarity index 100%
rename from website/assets/js/prism.min.js
rename to website/assets/js/vendor/prism.min.js
diff --git a/website/assets/js/quickstart.min.js b/website/assets/js/vendor/quickstart.min.js
similarity index 100%
rename from website/assets/js/quickstart.min.js
rename to website/assets/js/vendor/quickstart.min.js

From 38ef4274b6c921f5890cec6f4040be2675eeb0ac Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 14:07:42 +0100
Subject: [PATCH 19/90] Remove confusing icon for non-compatible models

ModelLoader will now output "not compatible" if no compatible version of a model is found for a given spaCy version.
---
 website/_includes/_page_models.jade    | 2 +-
 website/assets/css/_base/_objects.sass | 9 +--------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade
index 10e7e1746..b220f8d35 100644
--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@@ -54,7 +54,7 @@ for id in CURRENT_MODELS
                 +cell
                     .o-field.u-float-left
                         select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat")
-                    .o-empty(data-tpl=id data-tpl-key="compat-versions") &nbsp;
+                    div(data-tpl=id data-tpl-key="compat-versions") &nbsp;
 
         section(data-tpl=id data-tpl-key="benchmarks" style="display: none")
             +grid.o-block-small
diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass
index 23dc14744..4e63a4346 100644
--- a/website/assets/css/_base/_objects.sass
+++ b/website/assets/css/_base/_objects.sass
@@ -163,11 +163,4 @@
      height: 1.4em
      border: none
      text-align-last: center
-
-.o-empty:empty:before
-    @include size(1em)
-    border-radius: 50%
-    content: ""
-    display: inline-block
-    background: $color-red
-    vertical-align: middle
+     width: 100%

From fb2710211bbfbbdae1bc358184f2b6e190d225cf Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 14:08:26 +0100
Subject: [PATCH 20/90] Integrate rollup into website build process

---
 website/package.json | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/website/package.json b/website/package.json
index c86aca222..bd2e1c62f 100644
--- a/website/package.json
+++ b/website/package.json
@@ -8,13 +8,15 @@
   "devDependencies": {
     "babel-cli": "^6.14.0",
     "harp": "^0.24.0",
+    "rollup": "^0.50.0",
     "uglify-js": "^2.7.3"
   },
   "dependencies": {},
   "scripts": {
     "compile": "NODE_ENV=deploy harp compile",
-    "compile_js": "babel www/assets/js/main.js --out-file www/assets/js/main.js --presets=es2015",
-    "uglify": "uglifyjs www/assets/js/main.js --output www/assets/js/main.js",
-    "build": "npm run compile && npm run compile_js && npm run uglify"
+    "rollup_js": "rollup www/assets/js/rollup.js --output.format iife --output.file www/assets/js/rollup.js",
+    "compile_rollup": "babel www/assets/js/rollup.js --out-file www/assets/js/rollup.js --presets=es2015",
+    "uglify": "uglifyjs www/assets/js/rollup.js --output www/assets/js/rollup.js",
+    "build": "npm run compile && echo 'Compiled website' && npm run rollup_js && echo 'Bundled rollup.js' && npm run compile_rollup && echo 'Compiled rollup.js' && npm run uglify && echo 'Uglified rollup.js'"
   }
 }

From 1eb1ed0c7c899852385263b3cb2e084f58012ccc Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 14:09:43 +0100
Subject: [PATCH 21/90] Add tool for model comparison (experimental)

Users can select two models, and their meta is fetched from GitHub. Features, accuracy figures and speed benchmarks are displayed in a table, with an additional chart comparing the accuracy scores if available. Main use case: demonstrating and visualising trade-offs between larger and smaller models of the same type.
---
 website/_includes/_scripts.jade |   6 ++
 website/assets/js/models.js     | 149 ++++++++++++++++++++++++++++++++
 website/models/_data.json       |  14 ++-
 website/models/comparison.jade  |  81 +++++++++++++++++
 4 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 website/models/comparison.jade

diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade
index e1d9f773a..05a468076 100644
--- a/website/_includes/_scripts.jade
+++ b/website/_includes/_scripts.jade
@@ -46,6 +46,7 @@ if IS_PAGE
 - NavHighlighter = "new NavHighlighter('data-section', 'data-nav');"
 - GitHubEmbed = "new GitHubEmbed('" + SOCIAL.github + "', 'data-gh-embed');"
 - ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
+- ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"
 
 //- Browsers with JS module support.
     Will be ignored otherwise.
@@ -64,6 +65,9 @@ script(type="module")
     if HAS_MODELS
         | import { ModelLoader } from '/assets/js/models.js';
         !=ModelLoader
+    if compare_models
+        | import { ModelComparer } from '/assets/js/models.js';
+        !=ModelComparer
 
 //- Browsers with no JS module support.
     Won't be fetched or interpreted otherwise.
@@ -78,3 +82,5 @@ script(nomodule)
         !=GitHubEmbed
     if HAS_MODELS
         !=ModeLoader
+    if compare_models
+        !=ModelComparer
diff --git a/website/assets/js/models.js b/website/assets/js/models.js
index 5fe7ff54a..7dafdd4e7 100644
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@@ -158,3 +158,152 @@ export class ModelLoader {
     }
 }
 
+export class ModelComparer {
+    /**
+     * Compare two model meta files and render chart and comparison table.
+     * @param {string} repo - Path to GitHub repository containing releases.
+     * @param {Object} licenses - License IDs mapped to URLs.
+     * @param {Object} benchmarkKeys - Objects of available keys by type, e.g.
+     *                                 'parser', 'ner', 'speed', mapped to labels.
+     * @param {Object} languages - Available languages, ID mapped to name.
+     * @param {Object} defaultModels - Models to compare on load, 'model1' and
+     *                                 'model2' mapped to model names.
+     */
+    constructor(repo, licenses = {}, benchmarkKeys = {}, languages = {}, labels = {}, defaultModels) {
+        this.url = `https://raw.githubusercontent.com/${repo}/master`;
+        this.repo = `https://github.com/${repo}`;
+        this.tpl = new Templater('compare');
+        this.benchKeys = benchmarkKeys;
+        this.licenses = licenses;
+        this.languages = languages;
+        this.labels = labels;
+        this.models = {};
+        this.colors = CHART_COLORS;
+        this.defaultModels = defaultModels;
+        this.fetchCompat()
+            .then(compat => this.init(compat))
+            .catch(this.showError.bind(this))
+    }
+
+    init(compat) {
+        this.compat = compat;
+        const selectA = this.tpl.get('model1');
+        const selectB = this.tpl.get('model2');
+        selectA.addEventListener('change', this.onSelect.bind(this));
+        selectB.addEventListener('change', this.onSelect.bind(this));
+        this.chart = new Chart('chart_compare_accuracy', { type: 'bar',
+            options: { responsive: true, scales: {
+                yAxes: [{ label: 'Accuracy', ticks: { min: 70 }}],
+                xAxes: [{ barPercentage: 0.75 }]
+            }}
+        });
+        if (this.defaultModels) {
+            selectA.value = this.defaultModels.model1;
+            selectB.value = this.defaultModels.model2;
+            this.getModels(this.defaultModels);
+        }
+    }
+
+    fetchCompat() {
+        return new Promise((resolve, reject) =>
+            fetch(`${this.url}/compatibility.json`)
+                .then(res => handleResponse(res))
+                .then(json => json.ok ? resolve(json.spacy) : reject()))
+    }
+
+    fetchModel(name) {
+        const version = getLatestVersion(name, this.compat);
+        const modelName = `${name}-${version}`;
+        return new Promise((resolve, reject) => {
+            // resolve immediately if model already loaded, e.g. in this.models
+            if (this.models[name]) resolve(this.models[name]);
+            else fetch(`${this.url}/meta/${modelName}.json`)
+                .then(res => handleResponse(res))
+                .then(json => json.ok ? resolve(this.saveModel(name, json)) : reject())
+        })
+    }
+
+    /**
+     * "Save" meta to this.models so it only has to be fetched from GitHub once.
+     * @param {string} name - The model name.
+     * @param {Object} data - The model meta data.
+     */
+    saveModel(name, data) {
+        this.models[name] = data;
+        return data;
+    }
+
+    showError() {
+        this.tpl.get('result').style.display = 'none';
+        this.tpl.get('error').style.display = 'block';
+    }
+
+    onSelect(ev) {
+        const modelId = ev.target.value;
+        const otherId = (ev.target.id == 'model1') ? 'model2' : 'model1';
+        const otherVal = this.tpl.get(otherId);
+        const otherModel = otherVal.options[otherVal.selectedIndex].value;
+        if (otherModel != '') this.getModels({
+            [ev.target.id]: modelId,
+            [otherId]: otherModel
+        })
+    }
+
+    getModels({ model1, model2 }) {
+        this.tpl.get('result').setAttribute('data-loading', '');
+        this.fetchModel(model1)
+            .then(data1 => this.fetchModel(model2)
+                .then(data2 => this.render({ model1: data1, model2: data2 })))
+                .catch(this.showError.bind(this))
+    }
+
+    /**
+     * Render two models, and populate the chart and table. Currently quite hacky :(
+     * @param {Object} models - The models to render.
+     * @param {Object} models.model1 - The first model (via first <select>).
+     * @param {Object} models.model2 - The second model (via second <select>).
+     */
+    render({ model1, model2 }) {
+        const accKeys = Object.assign({}, this.benchKeys.parser, this.benchKeys.ner);
+        const allKeys = [...Object.keys(model1.accuracy || []), ...Object.keys(model2.accuracy || [])];
+        const metaKeys = Object.keys(accKeys).filter(k => allKeys.includes(k));
+        const labels = metaKeys.map(key => accKeys[key]);
+        const datasets = [model1, model2]
+            .map(({ lang, name, version, accuracy = {} }, i) => ({
+                label: `${lang}_${name}-${version}`,
+                backgroundColor: this.colors[`model${i + 1}`],
+                data: metaKeys.map(key => (accuracy[key] || 0).toFixed(2))
+            }));
+        this.chart.data = { labels, datasets };
+        this.chart.update();
+        [model1, model2].forEach((model, i) => this.renderTable(metaKeys, i + 1, model));
+        this.tpl.get('result').removeAttribute('data-loading');
+    }
+
+    renderTable(metaKeys, i, { lang, name, version, size, description,
+        notes, author, url, license, sources, vectors, pipeline, accuracy = {},
+        speed = {}}) {
+        const type = name.split('_')[0];  // extract type from model name
+        const genre = name.split('_')[1];  // extract genre from model name
+        this.tpl.fill(`table-head${i}`, `${lang}_${name}`);
+        this.tpl.get(`link${i}`).setAttribute('href', `/models/${lang}#${lang}_${name}`);
+        this.tpl.fill(`download${i}`, `spacy download ${lang}_${name}\n`);
+        this.tpl.fill(`lang${i}`, this.languages[lang] || lang);
+        this.tpl.fill(`type${i}`, this.labels[type] || type);
+        this.tpl.fill(`genre${i}`, this.labels[genre] || genre);
+        this.tpl.fill(`version${i}`, formats.version(version), true);
+        this.tpl.fill(`size${i}`, size);
+        this.tpl.fill(`desc${i}`, description || 'n/a');
+        this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true);
+        this.tpl.fill(`vectors${i}`, formats.vectors(vectors));
+        this.tpl.fill(`sources${i}`, formats.sources(sources));
+        this.tpl.fill(`author${i}`, formats.author(author, url), true);
+        this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true);
+        // check if model accuracy or speed includes one of the pre-set keys
+        for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) {
+            if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2))
+            else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key])))
+            else this.tpl.fill(`${key}${i}`, 'n/a')
+        }
+    }
+}
diff --git a/website/models/_data.json b/website/models/_data.json
index d41d45e8e..959d73133 100644
--- a/website/models/_data.json
+++ b/website/models/_data.json
@@ -1,7 +1,8 @@
 {
     "sidebar": {
         "Models": {
-            "Overview": "./"
+            "Overview": "./",
+            "Comparison": "comparison"
         },
 
         "Language models": {
@@ -26,6 +27,17 @@
         }
     },
 
+    "comparison": {
+        "title": "Model Comparison",
+        "teaser": "Compare spaCy's statistical models and their accuracy.",
+        "tag": "experimental",
+        "compare_models": true,
+        "default_models": {
+            "model1": "en_core_web_sm",
+            "model2": "en_core_web_lg"
+        }
+    },
+
     "MODELS": {
         "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
         "de": ["de_dep_news_sm"],
diff --git a/website/models/comparison.jade b/website/models/comparison.jade
new file mode 100644
index 000000000..185c4362e
--- /dev/null
+++ b/website/models/comparison.jade
@@ -0,0 +1,81 @@
+//- 💫 DOCS > MODELS > COMPARISON
+
+include ../_includes/_mixins
+
+p
+    |  This experimental tool helps you compare spaCy's statistical models
+    |  by features, accuracy and speed. This can be especially useful to get an
+    |  idea of the trade-offs between larger and smaller models of the same
+    |  type. For example, #[code lg] models tend to be more accurate than
+    |  the corresponding #[code sm] versions – but they're often significantly
+    |  larger in file size and memory usage.
+
+- TPL = "compare"
+
++grid.o-box
+    for i in [1, 2]
+        +grid-col("half", "no-gutter")
+            label.u-heading.u-text-label.u-text-center.u-color-theme(for="model#{i}") Model #{i}
+            .o-field.o-grid.o-grid--vcenter.u-padding-small
+                select.o-field__select.u-text-small(id="model#{i}" data-tpl=TPL data-tpl-key="model#{i}")
+                    option(selected="" disabled="" value="") Select model...
+                    for models, _ in MODELS
+                        for model in models
+                            option(value=model)=model
+
+div(data-tpl=TPL data-tpl-key="error" style="display: none")
+    +infobox
+        |  Unable to load model details and accuracy figures from GitHub to
+        |  compare the models. For details of the individual models, see the
+        |  overview of the
+        |  #[+a(gh("spacy-models") + "/releases") latest model releases].
+
+div(data-tpl=TPL data-tpl-key="result")
+    +chart("compare_accuracy", 350)
+
+    +aside-code("Download", "text")(style="display: none")
+        for i in [1, 2]
+            span(data-tpl=TPL data-tpl-key="download#{i}")
+
+    +table.o-block-small(data-tpl=TPL data-tpl-key="table")
+        +row("head")
+            +head-cell
+            for i in [1, 2]
+                +head-cell(style="width: 40%")
+                    a(data-tpl=TPL data-tpl-key="link#{i}")
+                        code(data-tpl=TPL data-tpl-key="table-head#{i}" style="text-transform: initial; font-weight: normal")
+
+        for label, id in {lang: "Language", type: "Type", genre: "Genre"}
+            +row
+                +cell #[+label=label]
+                for i in [1, 2]
+                    +cell(data-tpl=TPL data-tpl-key="#{id}#{i}") n/a
+
+        for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"]
+            - var field = label.toLowerCase()
+            +row
+                +cell.u-nowrap
+                    +label=label
+                        if MODEL_META[field]
+                            |  #[+help(MODEL_META[field]).u-color-subtle]
+                for i in [1, 2]
+                    +cell
+                        span(data-tpl=TPL data-tpl-key=field + i) #[em n/a]
+
+        +row
+            +cell #[+label Description]
+            for i in [1, 2]
+                +cell.u-text-tiny(data-tpl=TPL data-tpl-key="desc#{i}") n/a
+
+        for benchmark, _ in MODEL_BENCHMARKS
+            - var counter = 0
+            for label, field in benchmark
+                +row((counter == 0) ? "divider" : null)
+                    +cell.u-nowrap
+                        +label=label
+                            if MODEL_META[field]
+                                |  #[+help(MODEL_META[field]).u-color-subtle]
+                    for i in [1, 2]
+                        +cell
+                            span(data-tpl=TPL data-tpl-key=field + i) n/a
+                - counter++

From 14ad92d3374a161f2947dbc2049dadad33169c20 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 16:16:19 +0100
Subject: [PATCH 22/90] Ensure fallbacks / progressive enhancement if JS
 disabled

---
 website/_includes/_mixins.jade       |  7 ++++++-
 website/_includes/_page_models.jade  |  2 +-
 website/_includes/_sidebar.jade      |  2 +-
 website/assets/js/github-embed.js    | 16 +++++++++++-----
 website/assets/js/models.js          |  2 ++
 website/assets/js/nav-highlighter.js |  2 ++
 website/models/comparison.jade       |  6 +++---
 7 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade
index 94d84b4fe..615160023 100644
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@@ -281,7 +281,12 @@ mixin github(repo, file, height, alt_file, language)
 
     figure.o-block
         pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px")
-            code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}")
+            code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}").
+                Can't fetch code example from GitHub :(
+
+                Please use the link below to view the example. If you've come across
+                a broken link, we always appreciate a pull request to the repository,
+                or a report on the issue tracker. Thanks!
 
         footer.o-grid.u-text
             .o-block-small.u-flex-full.u-padding-small #[+icon("github")] #[code.u-break.u-break--all=repo + '/' + (alt_file || file)]
diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade
index b220f8d35..1cab930fb 100644
--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@@ -20,7 +20,7 @@ for id in CURRENT_MODELS
 
         p(data-tpl=id data-tpl-key="description")
 
-        div(data-tpl=id data-tpl-key="error" style="display: none")
+        div(data-tpl=id data-tpl-key="error")
             +infobox
                 |  Unable to load model details from GitHub. To find out more
                 |  about this model, see the overview of the
diff --git a/website/_includes/_sidebar.jade b/website/_includes/_sidebar.jade
index 1bca2cb80..9b9cd00a3 100644
--- a/website/_includes/_sidebar.jade
+++ b/website/_includes/_sidebar.jade
@@ -19,5 +19,5 @@ menu.c-sidebar.js-sidebar.u-text
                                     - var counter = 0
                                     for id, title in menu
                                         - counter++
-                                        li.c-sidebar__crumb__item(data-nav=id class=(counter == 1) ? "is-active" : null)
+                                        li.c-sidebar__crumb__item(data-nav=id)
                                             +a("#section-" + id)=title
diff --git a/website/assets/js/github-embed.js b/website/assets/js/github-embed.js
index 58e80ee1a..ec72fd713 100644
--- a/website/assets/js/github-embed.js
+++ b/website/assets/js/github-embed.js
@@ -14,7 +14,6 @@ export default class GitHubEmbed {
     constructor(user, attr) {
         this.url = `https://raw.githubusercontent.com/${user}`;
         this.attr = attr;
-        this.error = `\nCan't fetch code example from GitHub :(\n\nPlease use the link below to view the example. If you've come across\na broken link, we always appreciate a pull request to the repository,\nor a report on the issue tracker. Thanks!`;
         [...$$(`[${this.attr}]`)].forEach(el => this.embed(el));
     }
 
@@ -27,10 +26,17 @@ export default class GitHubEmbed {
         el.parentElement.setAttribute('data-loading', '');
         fetch(`${this.url}/${el.getAttribute(this.attr)}`)
             .then(res => res.text().then(text => ({ text, ok: res.ok })))
-            .then(({ text, ok }) => {
-                el.textContent = ok ? text : this.error;
-                if (ok && window.Prism) Prism.highlightElement(el);
-            })
+            .then(({ text, ok }) => ok ? this.render(el, text) : false)
         el.parentElement.removeAttribute('data-loading');
     }
+
+    /**
+     * Add text to container and apply syntax highlighting via Prism, if available.
+     * @param {node} el - The element.
+     * @param {string} text - The raw code, fetched from GitHub.
+     */
+    render(el, text) {
+        el.textContent = text;
+        if (window.Prism) Prism.highlightElement(el);
+    }
 }
diff --git a/website/assets/js/models.js b/website/assets/js/models.js
index 7dafdd4e7..10ef87642 100644
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@@ -100,6 +100,7 @@ export class ModelLoader {
         const modelId = `${data.lang}_${data.name}`;
         const model = `${modelId}-${data.version}`;
         const tpl = new Templater(modelId);
+        tpl.get('error').style.display = 'none';
         this.renderDetails(tpl, data)
         this.renderBenchmarks(tpl, data.accuracy, data.speed);
         this.renderCompat(tpl, modelId);
@@ -180,6 +181,7 @@ export class ModelComparer {
         this.models = {};
         this.colors = CHART_COLORS;
         this.defaultModels = defaultModels;
+        this.tpl.get('result').style.display = 'block';
         this.fetchCompat()
             .then(compat => this.init(compat))
             .catch(this.showError.bind(this))
diff --git a/website/assets/js/nav-highlighter.js b/website/assets/js/nav-highlighter.js
index 40f708e5e..a7bb227d5 100644
--- a/website/assets/js/nav-highlighter.js
+++ b/website/assets/js/nav-highlighter.js
@@ -11,6 +11,8 @@ export default class NavHighlighter {
      */
     constructor(sectionAttr, navAttr, activeClass = 'is-active') {
         this.sections = [...$$(`[${navAttr}]`)];
+        // highlight first item regardless
+        if (this.sections.length) this.sections[0].classList.add(activeClass);
         this.navAttr = navAttr;
         this.sectionAttr = sectionAttr;
         this.activeClass = activeClass;
diff --git a/website/models/comparison.jade b/website/models/comparison.jade
index 185c4362e..881a9aff4 100644
--- a/website/models/comparison.jade
+++ b/website/models/comparison.jade
@@ -23,17 +23,17 @@ p
                         for model in models
                             option(value=model)=model
 
-div(data-tpl=TPL data-tpl-key="error" style="display: none")
+div(data-tpl=TPL data-tpl-key="error")
     +infobox
         |  Unable to load model details and accuracy figures from GitHub to
         |  compare the models. For details of the individual models, see the
         |  overview of the
         |  #[+a(gh("spacy-models") + "/releases") latest model releases].
 
-div(data-tpl=TPL data-tpl-key="result")
+div(data-tpl=TPL data-tpl-key="result" style="display: none")
     +chart("compare_accuracy", 350)
 
-    +aside-code("Download", "text")(style="display: none")
+    +aside-code("Download", "text")
         for i in [1, 2]
             span(data-tpl=TPL data-tpl-key="download#{i}")
 

From 3ffbb64ab6b0340d62cd1624402c934f6d54e6e7 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 17:25:49 +0100
Subject: [PATCH 23/90] Unify chart options and update styleguide

---
 website/assets/js/models.js | 21 +++++++++++++--------
 website/styleguide.jade     | 23 ++++++++++-------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/website/assets/js/models.js b/website/assets/js/models.js
index 10ef87642..e79073edd 100644
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@@ -5,9 +5,11 @@ import { Templater, handleResponse, convertNumber } from './util.js';
 /**
  * Chart.js defaults
  */
-Chart.defaults.global.legend.position = 'bottom';
-Chart.defaults.global.defaultFontFamily = "-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'";
 const CHART_COLORS = { model1: '#09a3d5', model2: '#066B8C' };
+const CHART_FONTS = {
+    legend: '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"',
+    ticks: 'Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace'
+};
 
 /**
  * Formatters for model details.
@@ -180,6 +182,7 @@ export class ModelComparer {
         this.labels = labels;
         this.models = {};
         this.colors = CHART_COLORS;
+        this.fonts = CHART_FONTS;
         this.defaultModels = defaultModels;
         this.tpl.get('result').style.display = 'block';
         this.fetchCompat()
@@ -193,12 +196,14 @@ export class ModelComparer {
         const selectB = this.tpl.get('model2');
         selectA.addEventListener('change', this.onSelect.bind(this));
         selectB.addEventListener('change', this.onSelect.bind(this));
-        this.chart = new Chart('chart_compare_accuracy', { type: 'bar',
-            options: { responsive: true, scales: {
-                yAxes: [{ label: 'Accuracy', ticks: { min: 70 }}],
-                xAxes: [{ barPercentage: 0.75 }]
-            }}
-        });
+        this.chart = new Chart('chart_compare_accuracy', { type: 'bar', options: {
+            responsive: true,
+            legend: { position: 'bottom', labels: { fontFamily: this.fonts.legend, fontSize: 13 }},
+            scales: {
+                yAxes: [{ label: 'Accuracy', ticks: { min: 70, fontFamily: this.fonts.ticks }}],
+                xAxes: [{ barPercentage: 0.75, ticks: { fontFamily: this.fonts.ticks }}]
+            }
+        }});
         if (this.defaultModels) {
             selectA.value = this.defaultModels.model1;
             selectB.value = this.defaultModels.model2;
diff --git a/website/styleguide.jade b/website/styleguide.jade
index b503569b7..8034615e1 100644
--- a/website/styleguide.jade
+++ b/website/styleguide.jade
@@ -130,10 +130,11 @@ include _includes/_mixins
         |  capabilities and can be used to mark features that require a
         |  respective model to be installed.
 
-    p.o-block.o-inline-list
-        +tag I'm a tag
-        +tag-new(2)
-        +tag-model("Named entities")
+    .o-block
+        p.o-inline-list
+            +tag I'm a tag
+            +tag-new(2)
+            +tag-model("Named entities")
 
     +h(3, "icons", "website/_includes/_svg.jade") Icons
 
@@ -359,18 +360,14 @@ include _includes/_mixins
         script(src="/assets/js/chart.min.js")
         script new Chart('chart_accuracy', { datasets: [] })
 
-    +grid
-        +grid-col("half")
-            +chart("accuracy", 400)
+    +chart("accuracy", 400)
+    +chart("speed", 300)
 
-        +grid-col("half")
-            +chart("speed", 300)
-
-    script(src="/assets/js/chart.min.js")
+    script(src="/assets/js/vendor/chart.min.js")
     script.
         Chart.defaults.global.defaultFontFamily = "-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'";
-        new Chart('chart_accuracy', { type: 'bar', options: { legend: false, responsive: true, scales: { yAxes: [{ label: 'Accuracy', ticks: { suggestedMin: 70 } }], xAxes: [{ barPercentage: 0.425 }]}}, data: { labels: ['UAS', 'LAS', 'POS', 'NER F', 'NER P', 'NER R'], datasets: [{ label: 'en_core_web_sm', data: [91.49, 89.66, 97.23, 86.46, 86.78, 86.15], backgroundColor: '#09a3d5' }]}});
-        new Chart('chart_speed', { type: 'horizontalBar', options: { legend: false, responsive: true, scales: { xAxes: [{ label: 'Speed', ticks: { suggestedMin: 0 }}], yAxes: [{ barPercentage: 0.425 }]}}, data: { labels: ['w/s CPU', 'w/s GPU'], datasets: [{ label: 'en_core_web_sm', data: [9575, 25531], backgroundColor: '#09a3d5'}]}});
+        new Chart('chart_accuracy', { type: 'bar', options: { legend: { position: 'bottom'}, responsive: true, scales: { yAxes: [{ label: 'Accuracy', ticks: { suggestedMin: 70 } }], xAxes: [{ barPercentage: 0.75 }]}}, data: { labels: ['UAS', 'LAS', 'POS', 'NER F', 'NER P', 'NER R'], datasets: [{ label: 'en_core_web_sm', data: [91.65, 89.77, 97.05, 84.80, 84.53, 85.06], backgroundColor: '#09a3d5' }, { label: 'en_core_web_lg', data: [91.49, 89.66, 97.23, 86.46, 86.78, 86.15], backgroundColor: '#066B8C'}]}});
+        new Chart('chart_speed', { type: 'horizontalBar', options: { legend: { position: 'bottom'}, responsive: true, scales: { xAxes: [{ label: 'Speed', ticks: { suggestedMin: 0 }}], yAxes: [{ barPercentage: 0.75 }]}}, data: { labels: ['w/s CPU', 'w/s GPU'], datasets: [{ label: 'en_core_web_sm', data: [9575, 25531], backgroundColor: '#09a3d5'}, { label: 'en_core_web_lg', data: [8421, 22092], backgroundColor: '#066B8C'}]}});
 
 +section("embeds")
     +h(2, "embeds") Embeds

From 98c35d2585c548e6ff2c25a537cfd81c25482283 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 18:38:41 +0100
Subject: [PATCH 24/90] Fix spacy vocab command

---
 spacy/__main__.py  |  2 +-
 spacy/cli/vocab.py | 38 ++++++++++++++++++++++----------------
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/spacy/__main__.py b/spacy/__main__.py
index 770ce5296..f4b5e6715 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -19,7 +19,7 @@ if __name__ == '__main__':
         'convert': convert,
         'package': package,
         'model': model,
-        'model': vocab,
+        'vocab': vocab,
         'profile': profile,
         'validate': validate
     }
diff --git a/spacy/cli/vocab.py b/spacy/cli/vocab.py
index c1bab825c..d05eff3f0 100644
--- a/spacy/cli/vocab.py
+++ b/spacy/cli/vocab.py
@@ -1,31 +1,33 @@
-'''Compile a vocabulary from a lexicon jsonl file and word vectors.'''
 # coding: utf8
 from __future__ import unicode_literals
 
-from pathlib import Path
 import plac
 import json
 import spacy
 import numpy
-from spacy.util import ensure_path
+from pathlib import Path
+
+from ..util import prints, ensure_path
 
 
 @plac.annotations(
     lang=("model language", "positional", None, str),
-    output_dir=("output directory to store model in", "positional", None, str),
+    output_dir=("model output directory", "positional", None, Path),
     lexemes_loc=("location of JSONL-formatted lexical data", "positional",
-                None, str),
-    vectors_loc=("location of vectors data, as numpy .npz (optional)",
-              "positional", None, str),
-    version=("Model version", "option", "V", str),
-)
-def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
-    out_dir = ensure_path(output_dir)
-    jsonl_loc = ensure_path(lexemes_loc)
+                 None, Path),
+    vectors_loc=("optional: location of vectors data, as numpy .npz",
+                 "positional", None, str))
+def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
+    """Compile a vocabulary from a lexicon jsonl file and word vectors."""
+    if not lexemes_loc.exists():
+        prints(lexemes_loc, title="Can't find lexical data", exits=1)
+    vectors_loc = ensure_path(vectors_loc)
     nlp = spacy.blank(lang)
     for word in nlp.vocab:
         word.rank = 0
-    with jsonl_loc.open() as file_:
+    lex_added = 0
+    vec_added = 0
+    with lexemes_loc.open() as file_:
         for line in file_:
             if line.strip():
                 attrs = json.loads(line)
@@ -35,14 +37,18 @@ def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
                     lex = nlp.vocab[attrs['orth']]
                     lex.set_attrs(**attrs)
                     assert lex.rank == attrs['id']
+                lex_added += 1
     if vectors_loc is not None:
         vector_data = numpy.load(open(vectors_loc, 'rb'))
         nlp.vocab.clear_vectors(width=vector_data.shape[1])
-        added = 0
         for word in nlp.vocab:
             if word.rank:
                 nlp.vocab.vectors.add(word.orth_, row=word.rank,
                                       vector=vector_data[word.rank])
-                added += 1
-    nlp.to_disk(out_dir)
+                vec_added += 1
+    if not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)
+    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
+           title="Sucessfully compiled vocab and vectors, and saved model")
     return nlp

From ce98fa79341806d5ef87c764350013f2b3722ef9 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 18:38:55 +0100
Subject: [PATCH 25/90] Fix formatting

---
 spacy/cli/evaluate.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index d4d54d8aa..d7695fd73 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -17,14 +17,14 @@ numpy.random.seed(0)
 
 
 @plac.annotations(
-    model=("Model name or path", "positional", None, str),
-    data_path=("Location of JSON-formatted evaluation data", "positional",
+    model=("model name or path", "positional", None, str),
+    data_path=("location of JSON-formatted evaluation data", "positional",
                None, str),
-    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
-    gpu_id=("Use GPU", "option", "g", int),
-    displacy_path=("Directory to output rendered parses as HTML", "option",
+    gold_preproc=("use gold preprocessing", "flag", "G", bool),
+    gpu_id=("use GPU", "option", "g", int),
+    displacy_path=("directory to output rendered parses as HTML", "option",
                    "dp", str),
-    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
+    displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
 def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
              displacy_path=None, displacy_limit=25):
     """

From abf8aa05d39688a69afd6c389ab517263982572e Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 18:39:38 +0100
Subject: [PATCH 26/90] Populate --create-meta defaults from file if available

If meta.json is found in directory and user chooses to overwrite it, show existing data as defaults.
---
 spacy/cli/package.py             | 40 +++++++++++++++++---------------
 website/api/_top-level/_cli.jade | 18 +++++++-------
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index d1984fe65..3157ba99d 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -16,10 +16,11 @@ from .. import about
     input_dir=("directory with model data", "positional", None, str),
     output_dir=("output parent directory", "positional", None, str),
     meta_path=("path to meta.json", "option", "m", str),
-    create_meta=("create meta.json, even if one exists in directory", "flag",
-                 "c", bool),
-    force=("force overwriting of existing folder in output directory", "flag",
-           "f", bool))
+    create_meta=("create meta.json, even if one exists in directory – if "
+                 "existing meta is found, entries are shown as defaults in "
+                 "the command line prompt", "flag", "c", bool),
+    force=("force overwriting of existing model directory in output directory",
+           "flag", "f", bool))
 def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
             force=False):
     """
@@ -41,13 +42,13 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
     template_manifest = get_template('MANIFEST.in')
     template_init = get_template('xx_model_name/__init__.py')
     meta_path = meta_path or input_path / 'meta.json'
-    if not create_meta and meta_path.is_file():
-        prints(meta_path, title="Reading meta.json from file")
+    if meta_path.is_file():
         meta = util.read_json(meta_path)
-    else:
-        meta = generate_meta(input_dir)
+        if not create_meta:  # only print this if user doesn't want to overwrite
+            prints(meta_path, title="Loaded meta.json from file")
+        else:
+            meta = generate_meta(input_dir, meta)
     meta = validate_meta(meta, ['lang', 'name', 'version'])
-
     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
     main_path = output_path / model_name_v
@@ -82,18 +83,19 @@ def create_file(file_path, contents):
     file_path.open('w', encoding='utf-8').write(contents)
 
 
-def generate_meta(model_path):
-    meta = {}
-    settings = [('lang', 'Model language', 'en'),
-                ('name', 'Model name', 'model'),
-                ('version', 'Model version', '0.0.0'),
+def generate_meta(model_path, existing_meta):
+    meta = existing_meta or {}
+    settings = [('lang', 'Model language', meta.get('lang', 'en')),
+                ('name', 'Model name', meta.get('name', 'model')),
+                ('version', 'Model version', meta.get('version', '0.0.0')),
                 ('spacy_version', 'Required spaCy version',
                  '>=%s,<3.0.0' % about.__version__),
-                ('description', 'Model description', False),
-                ('author', 'Author', False),
-                ('email', 'Author email', False),
-                ('url', 'Author website', False),
-                ('license', 'License', 'CC BY-NC 3.0')]
+                ('description', 'Model description',
+                  meta.get('description', False)),
+                ('author', 'Author', meta.get('author', False)),
+                ('email', 'Author email', meta.get('email', False)),
+                ('url', 'Author website', meta.get('url', False)),
+                ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
     nlp = util.load_model_from_path(Path(model_path))
     meta['pipeline'] = nlp.pipe_names
     meta['vectors'] = {'width': nlp.vocab.vectors_length,
diff --git a/website/api/_top-level/_cli.jade b/website/api/_top-level/_cli.jade
index f19eb43d0..aa13abc12 100644
--- a/website/api/_top-level/_cli.jade
+++ b/website/api/_top-level/_cli.jade
@@ -453,10 +453,11 @@ p
 p
     |  Generate a #[+a("/usage/training#models-generating") model Python package]
     |  from an existing model data directory. All data files are copied over.
-    |  If the path to a meta.json is supplied, or a meta.json is found in the
-    |  input directory, this file is used. Otherwise, the data can be entered
-    |  directly from the command line. The required file templates are downloaded
-    |  from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
+    |  If the path to a #[code meta.json] is supplied, or a #[code meta.json] is
+    |  found in the input directory, this file is used. Otherwise, the data can
+    |  be entered directly from the command line. The required file templates
+    |  are downloaded from
+    |  #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
     |  sure you're always using the latest versions. This means you need to be
     |  connected to the internet to use this command.
 
@@ -477,15 +478,16 @@ p
     +row
         +cell #[code --meta-path], #[code -m]
         +cell option
-        +cell #[+tag-new(2)] Path to meta.json file (optional).
+        +cell #[+tag-new(2)] Path to #[code meta.json] file (optional).
 
     +row
         +cell #[code --create-meta], #[code -c]
         +cell flag
         +cell
-            |  #[+tag-new(2)] Create a meta.json file on the command line, even
-            |  if one already exists in the directory.
-
+            |  #[+tag-new(2)] Create a #[code meta.json] file on the command
+            |  line, even if one already exists in the directory. If an
+            |  existing file is found, its entries will be shown as the defaults
+            |  in the command line prompt.
     +row
         +cell #[code --force], #[code -f]
         +cell flag

From 8e022942413f65a7b28ea45fa92ba687db76d1f9 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 18:39:48 +0100
Subject: [PATCH 27/90] Add vectors to Language.meta

---
 spacy/language.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 05546cde4..1ce74b265 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -154,6 +154,8 @@ class Language(object):
         self._meta.setdefault('email', '')
         self._meta.setdefault('url', '')
         self._meta.setdefault('license', '')
+        self._meta['vectors'] = {'width': self.vocab.vectors_length,
+                                 'entries': len(self.vocab.vectors)}
         self._meta['pipeline'] = self.pipe_names
         return self._meta
 

From 559854205506ef896a3effe63206a73012d7d0d0 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 18:58:55 +0100
Subject: [PATCH 28/90] Add link

---
 website/api/_top-level/_spacy.jade | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade
index 81612c5e6..c6b342011 100644
--- a/website/api/_top-level/_spacy.jade
+++ b/website/api/_top-level/_spacy.jade
@@ -85,7 +85,9 @@ p
     +row
         +cell #[code name]
         +cell unicode
-        +cell ISO code of the language class to load.
+        +cell
+            |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]
+            |  of the language class to load.
 
     +row
         +cell #[code disable]

From 12343e23fda04f1a607e3dbd67a1bae45275f09e Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 18:59:08 +0100
Subject: [PATCH 29/90] Update CLI docs and document vocab command

---
 website/api/_top-level/_cli.jade | 96 +++++++++++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/website/api/_top-level/_cli.jade b/website/api/_top-level/_cli.jade
index aa13abc12..ec2c1737a 100644
--- a/website/api/_top-level/_cli.jade
+++ b/website/api/_top-level/_cli.jade
@@ -34,6 +34,13 @@ p
         +cell flag
         +cell Show help message and available arguments.
 
+    +row("foot")
+        +cell creates
+        +cell directory, symlink
+        +cell
+            |  The installed model package in your #[code site-packages]
+            |  directory and a shortcut link as a symlink in #[code spacy/data].
+
 +aside("Downloading best practices")
     |  The #[code download] command is mostly intended as a convenient,
     |  interactive wrapper – it performs compatibility checks and prints
@@ -86,6 +93,13 @@ p
         +cell flag
         +cell Show help message and available arguments.
 
+    +row("foot")
+        +cell creates
+        +cell symlink
+        +cell
+            |  A shortcut link of the given name as a symlink in
+            |  #[code spacy/data].
+
 +h(3, "info") Info
 
 p
@@ -113,6 +127,11 @@ p
         +cell flag
         +cell Show help message and available arguments.
 
+    +row("foot")
+        +cell prints
+        +cell #[code stdout]
+        +cell Information about your spaCy installation.
+
 +h(3, "validate") Validate
     +tag-new(2)
 
@@ -129,6 +148,12 @@ p
 +code(false, "bash", "$").
     spacy validate
 
++table(["Argument", "Type", "Description"])
+    +row("foot")
+        +cell prints
+        +cell #[code stdout]
+        +cell Details about the compatibility of your installed models.
+
 +h(3, "convert") Convert
 
 p
@@ -172,6 +197,11 @@ p
         +cell flag
         +cell Show help message and available arguments.
 
+    +row("foot")
+        +cell creates
+        +cell JSON
+        +cell Data in spaCy's #[+a("/api/annotation#json-input") JSON format].
+
 p The following converters are available:
 
 +table(["ID", "Description"])
@@ -286,6 +316,11 @@ p
         +cell flag
         +cell Show help message and available arguments.
 
+    +row("foot")
+        +cell creates
+        +cell model, pickle
+        +cell A spaCy model on each epoch, and a final #[code .pickle] file.
+
 +h(4, "train-hyperparams") Environment variables for hyperparameters
     +tag-new(2)
 
@@ -395,6 +430,47 @@ p
         +cell Gradient L2 norm constraint.
         +cell #[code 1.0]
 
++h(3, "vocab") Vocab
+    +tag-new(2)
+
+p
+    |  Compile a vocabulary from a #[+a("#") lexicon JSONL] file and optional
+    |  word vectors. Will save out a valid spaCy model that you can load via
+    |  #[+api("spacy#load") #[code spacy.load]] or package using the
+    |  #[+api("cli#package") #[code package]] command.
+
++code(false, "bash", "$").
+    spacy vocab [lang] [output_dir] [lexemes_loc] [vectors_loc]
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code lang]
+        +cell positional
+        +cell
+            |  Model language
+            |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code],
+            |  e.g. #[code en].
+
+    +row
+        +cell #[code output_dir]
+        +cell positional
+        +cell Model output directory. Will be created if it doesn't exist.
+
+    +row
+        +cell #[code lexemes_loc]
+        +cell positional
+        +cell Location of lexical data in spaCy's #[+a("#") JSONL format].
+
+    +row
+        +cell #[code vectors_loc]
+        +cell positional
+        +cell Optional location of vectors data as numpy #[code .npz] file.
+
+    +row("foot")
+        +cell creates
+        +cell model
+        +cell A spaCy model containing the vocab and vectors.
+
 +h(3, "evaluate") Evaluate
     +tag-new(2)
 
@@ -447,6 +523,11 @@ p
         +cell flag
         +cell Use gold preprocessing.
 
+    +row("foot")
+        +cell prints / creates
+        +cell #[code stdout], HTML
+        +cell Evaluation results and optional displaCy visualizations.
+
 
 +h(3, "package") Package
 
@@ -459,11 +540,19 @@ p
     |  are downloaded from
     |  #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
     |  sure you're always using the latest versions. This means you need to be
-    |  connected to the internet to use this command.
+    |  connected to the internet to use this command. After packaging, you
+    |  can run #[code python setup.py sdist] from the newly created directory
+    |  to turn your model into an installable archive file.
 
 +code(false, "bash", "$", false, false, true).
     spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
 
++aside-code("Example", "bash").
+    spacy package /input /output
+    cd /output/en_model-0.0.0
+    python setup.py sdist
+    pip install dist/en_model-0.0.0.tar.gz
+
 +table(["Argument", "Type", "Description"])
     +row
         +cell #[code input_dir]
@@ -497,3 +586,8 @@ p
         +cell #[code --help], #[code -h]
         +cell flag
         +cell Show help message and available arguments.
+
+    +row("foot")
+        +cell creates
+        +cell directory
+        +cell A Python package containing the spaCy model.

From ec657c1ddcdae63d2cd12a14a5c3536b44841555 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 19:35:41 +0100
Subject: [PATCH 30/90] Update vocab docs and document Vocab.prune_vectors

---
 spacy/vocab.pyx        | 12 +++++++++-
 website/api/vocab.jade | 51 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ff6c5b844..23254718f 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -252,7 +252,7 @@ cdef class Vocab:
         """Reduce the current vector table to `nr_row` unique entries. Words
         mapped to the discarded vectors will be remapped to the closest vector
         among those remaining.
-        
+
         For example, suppose the original table had vectors for the words:
         ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to,
         two rows, we would discard the vectors for 'feline' and 'reclined'.
@@ -263,6 +263,15 @@ cdef class Vocab:
         The similarities are judged by cosine. The original vectors may
         be large, so the cosines are calculated in minibatches, to reduce
         memory usage.
+
+        nr_row (int): The number of rows to keep in the vector table.
+        batch_size (int): Batch of vectors for calculating the similarities.
+            Larger batch sizes might be faster, while temporarily requiring
+            more memory.
+        RETURNS (dict): A dictionary keyed by removed words mapped to
+            `(string, score)` tuples, where `string` is the entry the removed
+            word was mapped to, and `score` the similarity score between the
+            two words.
         """
         xp = get_array_module(self.vectors.data)
         # Work in batches, to avoid memory problems.
@@ -285,6 +294,7 @@ cdef class Vocab:
                 self.vectors.add(lex.orth, row=lex.rank)
         # Make copy, to encourage the original table to be garbage collected.
         self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
+        # TODO: return new mapping
 
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary. Words can be looked
diff --git a/website/api/vocab.jade b/website/api/vocab.jade
index 6faefc064..54dd4f691 100644
--- a/website/api/vocab.jade
+++ b/website/api/vocab.jade
@@ -162,7 +162,7 @@ p
         +cell int
         +cell The integer ID by which the flag value can be checked.
 
-+h(2, "add_flag") Vocab.clear_vectors
++h(2, "clear_vectors") Vocab.clear_vectors
     +tag method
     +tag-new(2)
 
@@ -181,7 +181,50 @@ p
             |  Number of dimensions of the new vectors. If #[code None], size
             |  is not changed.
 
-+h(2, "add_flag") Vocab.get_vector
++h(2, "prune_vectors") Vocab.prune_vectors
+    +tag method
+    +tag-new(2)
+
+p
+    |  Reduce the current vector table to #[code nr_row] unique entries. Words
+    |  mapped to the discarded vectors will be remapped to the closest vector
+    |  among those remaining. For example, suppose the original table had
+    |  vectors for the words:
+    |  #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
+    |  vector table to two rows, we would discard the vectors for "feline"
+    |  and "reclined". These words would then be remapped to the closest
+    |  remaining vector – so "feline" would have the same vector as "cat",
+    |  and "reclined" would have the same vector as "sat". The similarities are
+    |  judged by cosine. The original vectors may be large, so the cosines are
+    |  calculated in minibatches, to reduce memory usage.
+
++aside-code("Example").
+    nlp.vocab.prune_vectors(10000)
+    assert len(nlp.vocab.vectors) &lt;= 10000
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code nr_row]
+        +cell int
+        +cell The number of rows to keep in the vector table.
+
+    +row
+        +cell #[code batch_size]
+        +cell int
+        +cell
+            |  Batch of vectors for calculating the similarities. Larger batch
+            |  sizes might be faster, while temporarily requiring more memory.
+
+    +row("foot")
+        +cell returns
+        +cell dict
+        +cell
+            |  A dictionary keyed by removed words mapped to
+            |  #[code (string, score)] tuples, where #[code string] is the entry
+            |  the removed word was mapped to, and #[code score] the similarity
+            |  score between the two words.
+
++h(2, "get_vector") Vocab.get_vector
     +tag method
     +tag-new(2)
 
@@ -206,7 +249,7 @@ p
             |  A word vector. Size and shape are determined by the
             |  #[code Vocab.vectors] instance.
 
-+h(2, "add_flag") Vocab.set_vector
++h(2, "set_vector") Vocab.set_vector
     +tag method
     +tag-new(2)
 
@@ -228,7 +271,7 @@ p
         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell The vector to set.
 
-+h(2, "add_flag") Vocab.has_vector
++h(2, "has_vector") Vocab.has_vector
     +tag method
     +tag-new(2)
 

From 57534253e6f8a4de010341d033e66a65ba49ad99 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 19:39:26 +0100
Subject: [PATCH 31/90] Move CLI docs to own page

---
 website/_includes/_functions.jade              |  3 ---
 website/api/_data.json                         | 11 +++++++++--
 website/api/{_top-level/_cli.jade => cli.jade} | 11 ++++++++---
 website/api/top-level.jade                     |  4 ----
 4 files changed, 17 insertions(+), 12 deletions(-)
 rename website/api/{_top-level/_cli.jade => cli.jade} (98%)

diff --git a/website/_includes/_functions.jade b/website/_includes/_functions.jade
index eb16d9659..39139cc58 100644
--- a/website/_includes/_functions.jade
+++ b/website/_includes/_functions.jade
@@ -41,9 +41,6 @@
 -           var comps = path.split('#');
 -           return "top-level#" + comps[0] + '.' + comps[1];
 -       }
--       else if (path.startsWith('cli#')) {
--           return "top-level#" + path.split('#')[1];
--       }
 -       return path;
 -   }
 
diff --git a/website/api/_data.json b/website/api/_data.json
index ba7997690..0be09b782 100644
--- a/website/api/_data.json
+++ b/website/api/_data.json
@@ -3,8 +3,10 @@
         "Overview": {
             "Architecture": "./",
             "Annotation Specs": "annotation",
+            "Command Line": "cli",
             "Functions": "top-level"
         },
+
         "Containers": {
             "Doc": "doc",
             "Token": "token",
@@ -45,14 +47,19 @@
         }
     },
 
+    "cli": {
+        "title": "Command Line Interface",
+        "teaser": "Download, train and package models, and debug spaCy.",
+        "source": "spacy/cli"
+    },
+
     "top-level": {
         "title": "Top-level Functions",
         "menu": {
             "spacy": "spacy",
             "displacy": "displacy",
             "Utility Functions": "util",
-            "Compatibility": "compat",
-            "Command Line": "cli"
+            "Compatibility": "compat"
         }
     },
 
diff --git a/website/api/_top-level/_cli.jade b/website/api/cli.jade
similarity index 98%
rename from website/api/_top-level/_cli.jade
rename to website/api/cli.jade
index ec2c1737a..cd1cb22fb 100644
--- a/website/api/_top-level/_cli.jade
+++ b/website/api/cli.jade
@@ -1,4 +1,6 @@
-//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE
+//- 💫 DOCS > API > COMMAND LINE INTERFACE
+
+include ../_includes/_mixins
 
 p
     |  As of v1.7.0, spaCy comes with new command line helpers to download and
@@ -434,7 +436,8 @@ p
     +tag-new(2)
 
 p
-    |  Compile a vocabulary from a #[+a("#") lexicon JSONL] file and optional
+    |  Compile a vocabulary from a
+    |  #[+a("/api/annotation#vocab-jsonl") lexicon JSONL] file and optional
     |  word vectors. Will save out a valid spaCy model that you can load via
     |  #[+api("spacy#load") #[code spacy.load]] or package using the
     |  #[+api("cli#package") #[code package]] command.
@@ -459,7 +462,9 @@ p
     +row
         +cell #[code lexemes_loc]
         +cell positional
-        +cell Location of lexical data in spaCy's #[+a("#") JSONL format].
+        +cell
+            |  Location of lexical data in spaCy's
+            |  #[+a("/api/annotation#vocab-jsonl") JSONL format].
 
     +row
         +cell #[code vectors_loc]
diff --git a/website/api/top-level.jade b/website/api/top-level.jade
index 46d2e8750..f16daae23 100644
--- a/website/api/top-level.jade
+++ b/website/api/top-level.jade
@@ -18,7 +18,3 @@ include ../_includes/_mixins
 +section("compat")
     +h(2, "compat", "spacy/compaty.py") Compatibility functions
     include _top-level/_compat
-
-+section("cli", "spacy/cli")
-    +h(2, "cli") Command line
-    include _top-level/_cli

From 18dde7869aff327987e9e318542bc9567c03b3b1 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 19:40:05 +0100
Subject: [PATCH 32/90] Update training data docs and add vocab JSONL

---
 examples/training/vocab-data.jsonl     | 500 +++++++++++++++++++++++++
 website/api/_annotation/_training.jade |  56 +++
 website/api/_data.json                 |   2 +-
 website/api/annotation.jade            |   2 +-
 4 files changed, 558 insertions(+), 2 deletions(-)
 create mode 100644 examples/training/vocab-data.jsonl

diff --git a/examples/training/vocab-data.jsonl b/examples/training/vocab-data.jsonl
new file mode 100644
index 000000000..4fae8fd65
--- /dev/null
+++ b/examples/training/vocab-data.jsonl
@@ -0,0 +1,500 @@
+{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
+{"orth": ".", "id": 1, "lower": ".", "norm": ".", "shape": ".", "prefix": ".", "suffix": ".", "length": 1, "cluster": "8", "prob": -3.0678977966308594, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": ",", "id": 2, "lower": ",", "norm": ",", "shape": ",", "prefix": ",", "suffix": ",", "length": 1, "cluster": "4", "prob": -3.4549596309661865, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "the", "id": 3, "lower": "the", "norm": "the", "shape": "xxx", "prefix": "t", "suffix": "the", "length": 3, "cluster": "11", "prob": -3.528766632080078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "I", "id": 4, "lower": "i", "norm": "I", "shape": "X", "prefix": "I", "suffix": "I", "length": 1, "cluster": "346", "prob": -3.791565179824829, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "to", "id": 5, "lower": "to", "norm": "to", "shape": "xx", "prefix": "t", "suffix": "to", "length": 2, "cluster": "12", "prob": -3.8560216426849365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "a", "id": 6, "lower": "a", "norm": "a", "shape": "x", "prefix": "a", "suffix": "a", "length": 1, "cluster": "19", "prob": -3.92978835105896, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "and", "id": 7, "lower": "and", "norm": "and", "shape": "xxx", "prefix": "a", "suffix": "and", "length": 3, "cluster": "20", "prob": -4.113108158111572, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "of", "id": 8, "lower": "of", "norm": "of", "shape": "xx", "prefix": "o", "suffix": "of", "length": 2, "cluster": "28", "prob": -4.27587366104126, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "you", "id": 9, "lower": "you", "norm": "you", "shape": "xxx", "prefix": "y", "suffix": "you", "length": 3, "cluster": "602", "prob": -4.373791217803955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "it", "id": 10, "lower": "it", "norm": "it", "shape": "xx", "prefix": "i", "suffix": "it", "length": 2, "cluster": "474", "prob": -4.388050079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "is", "id": 11, "lower": "is", "norm": "is", "shape": "xx", "prefix": "i", "suffix": "is", "length": 2, "cluster": "762", "prob": -4.457748889923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "that", "id": 12, "lower": "that", "norm": "that", "shape": "xxxx", "prefix": "t", "suffix": "hat", "length": 4, "cluster": "84", "prob": -4.464504718780518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "\n\n", "id": 0, "lower": "\n\n", "norm": "\n\n", "shape": "\n\n", "prefix": "\n", "suffix": "\n\n", "length": 2, "cluster": "0", "prob": -4.606560707092285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "in", "id": 13, "lower": "in", "norm": "in", "shape": "xx", "prefix": "i", "suffix": "in", "length": 2, "cluster": "60", "prob": -4.619071960449219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "'s", "id": 14, "lower": "'s", "norm": "'s", "shape": "'x", "prefix": "'", "suffix": "'s", "length": 2, "cluster": "52", "prob": -4.830559253692627, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "n't", "id": 15, "lower": "n't", "norm": "n't", "shape": "x'x", "prefix": "n", "suffix": "n't", "length": 3, "cluster": "74", "prob": -4.859938621520996, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "for", "id": 16, "lower": "for", "norm": "for", "shape": "xxx", "prefix": "f", "suffix": "for", "length": 3, "cluster": "508", "prob": -4.8801093101501465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "\"", "id": 17, "lower": "\"", "norm": "\"", "shape": "\"", "prefix": "\"", "suffix": "\"", "length": 1, "cluster": "0", "prob": -5.02677583694458, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
+{"orth": "?", "id": 18, "lower": "?", "norm": "?", "shape": "?", "prefix": "?", "suffix": "?", "length": 1, "cluster": "0", "prob": -5.05924654006958, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": " ", "id": 0, "lower": " ", "norm": " ", "shape": " ", "prefix": " ", "suffix": " ", "length": 1, "cluster": "0", "prob": -5.129165172576904, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "have", "id": 19, "lower": "have", "norm": "have", "shape": "xxxx", "prefix": "h", "suffix": "ave", "length": 4, "cluster": "378", "prob": -5.156484603881836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "on", "id": 20, "lower": "on", "norm": "on", "shape": "xx", "prefix": "o", "suffix": "on", "length": 2, "cluster": "2044", "prob": -5.172736167907715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "*", "id": 21, "lower": "*", "norm": "*", "shape": "*", "prefix": "*", "suffix": "*", "length": 1, "cluster": "5098", "prob": -5.1977410316467285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": ")", "id": 22, "lower": ")", "norm": ")", "shape": ")", "prefix": ")", "suffix": ")", "length": 1, "cluster": "0", "prob": -5.197994232177734, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": true}
+{"orth": "be", "id": 23, "lower": "be", "norm": "be", "shape": "xx", "prefix": "b", "suffix": "be", "length": 2, "cluster": "458", "prob": -5.225094318389893, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "with", "id": 24, "lower": "with", "norm": "with", "shape": "xxxx", "prefix": "w", "suffix": "ith", "length": 4, "cluster": "1020", "prob": -5.243249893188477, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "do", "id": 25, "lower": "do", "norm": "do", "shape": "xx", "prefix": "d", "suffix": "do", "length": 2, "cluster": "2042", "prob": -5.246996879577637, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "was", "id": 26, "lower": "was", "norm": "was", "shape": "xxx", "prefix": "w", "suffix": "was", "length": 3, "cluster": "250", "prob": -5.252320289611816, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "are", "id": 27, "lower": "are", "norm": "are", "shape": "xxx", "prefix": "a", "suffix": "are", "length": 3, "cluster": "1530", "prob": -5.271068096160889, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "not", "id": 28, "lower": "not", "norm": "not", "shape": "xxx", "prefix": "n", "suffix": "not", "length": 3, "cluster": "1258", "prob": -5.332601070404053, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "but", "id": 29, "lower": "but", "norm": "but", "shape": "xxx", "prefix": "b", "suffix": "but", "length": 3, "cluster": "148", "prob": -5.3419694900512695, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "!", "id": 30, "lower": "!", "norm": "!", "shape": "!", "prefix": "!", "suffix": "!", "length": 1, "cluster": "0", "prob": -5.359641075134277, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "this", "id": 31, "lower": "this", "norm": "this", "shape": "xxxx", "prefix": "t", "suffix": "his", "length": 4, "cluster": "63", "prob": -5.36181640625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "[", "id": 32, "lower": "[", "norm": "[", "shape": "[", "prefix": "[", "suffix": "[", "length": 1, "cluster": "0", "prob": -5.438112258911133, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": true, "is_right_punct": false}
+{"orth": "-", "id": 33, "lower": "-", "norm": "-", "shape": "-", "prefix": "-", "suffix": "-", "length": 1, "cluster": "36", "prob": -5.468655109405518, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "my", "id": 34, "lower": "my", "norm": "my", "shape": "xx", "prefix": "m", "suffix": "my", "length": 2, "cluster": "251", "prob": -5.491642951965332, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "they", "id": 35, "lower": "they", "norm": "they", "shape": "xxxx", "prefix": "t", "suffix": "hey", "length": 4, "cluster": "90", "prob": -5.5243682861328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "as", "id": 36, "lower": "as", "norm": "as", "shape": "xx", "prefix": "a", "suffix": "as", "length": 2, "cluster": "212", "prob": -5.53448486328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "like", "id": 37, "lower": "like", "norm": "like", "shape": "xxxx", "prefix": "l", "suffix": "ike", "length": 4, "cluster": "1684", "prob": -5.610429763793945, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "just", "id": 38, "lower": "just", "norm": "just", "shape": "xxxx", "prefix": "j", "suffix": "ust", "length": 4, "cluster": "31978", "prob": -5.630868434906006, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "your", "id": 39, "lower": "your", "norm": "your", "shape": "xxxx", "prefix": "y", "suffix": "our", "length": 4, "cluster": "251", "prob": -5.650108814239502, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "or", "id": 40, "lower": "or", "norm": "or", "shape": "xx", "prefix": "o", "suffix": "or", "length": 2, "cluster": "404", "prob": -5.654984951019287, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "(", "id": 41, "lower": "(", "norm": "(", "shape": "(", "prefix": "(", "suffix": "(", "length": 1, "cluster": "0", "prob": -5.75598669052124, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": true, "is_right_punct": false}
+{"orth": "at", "id": 42, "lower": "at", "norm": "at", "shape": "xx", "prefix": "a", "suffix": "at", "length": 2, "cluster": "124", "prob": -5.763442516326904, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "if", "id": 43, "lower": "if", "norm": "if", "shape": "xx", "prefix": "i", "suffix": "if", "length": 2, "cluster": "4052", "prob": -5.763589859008789, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "would", "id": 44, "lower": "would", "norm": "would", "shape": "xxxx", "prefix": "w", "suffix": "uld", "length": 5, "cluster": "1978", "prob": -5.772674560546875, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "so", "id": 45, "lower": "so", "norm": "so", "shape": "xx", "prefix": "s", "suffix": "so", "length": 2, "cluster": "2282", "prob": -5.823773384094238, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "can", "id": 46, "lower": "can", "norm": "can", "shape": "xxx", "prefix": "c", "suffix": "can", "length": 3, "cluster": "58", "prob": -5.827763080596924, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "me", "id": 47, "lower": "me", "norm": "me", "shape": "xx", "prefix": "m", "suffix": "me", "length": 2, "cluster": "1898", "prob": -5.846089839935303, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "about", "id": 48, "lower": "about", "norm": "about", "shape": "xxxx", "prefix": "a", "suffix": "out", "length": 5, "cluster": "618", "prob": -5.906808853149414, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "he", "id": 49, "lower": "he", "norm": "he", "shape": "xx", "prefix": "h", "suffix": "he", "length": 2, "cluster": "218", "prob": -5.9319047927856445, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "It", "id": 50, "lower": "it", "norm": "It", "shape": "Xx", "prefix": "I", "suffix": "It", "length": 2, "cluster": "894", "prob": -5.93662691116333, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "all", "id": 51, "lower": "all", "norm": "all", "shape": "xxx", "prefix": "a", "suffix": "all", "length": 3, "cluster": "6122", "prob": -5.936640739440918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "The", "id": 52, "lower": "the", "norm": "The", "shape": "Xxx", "prefix": "T", "suffix": "The", "length": 3, "cluster": "30", "prob": -5.958707332611084, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "get", "id": 53, "lower": "get", "norm": "get", "shape": "xxx", "prefix": "g", "suffix": "get", "length": 3, "cluster": "2570", "prob": -5.992605686187744, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "one", "id": 54, "lower": "one", "norm": "one", "shape": "xxx", "prefix": "o", "suffix": "one", "length": 3, "cluster": "8170", "prob": -5.996385097503662, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "'m", "id": 55, "lower": "'m", "norm": "'m", "shape": "'x", "prefix": "'", "suffix": "'m", "length": 2, "cluster": "3066", "prob": -5.9999823570251465, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "out", "id": 56, "lower": "out", "norm": "out", "shape": "xxx", "prefix": "o", "suffix": "out", "length": 3, "cluster": "1386", "prob": -6.0027008056640625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "from", "id": 57, "lower": "from", "norm": "from", "shape": "xxxx", "prefix": "f", "suffix": "rom", "length": 4, "cluster": "380", "prob": -6.010132312774658, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "an", "id": 58, "lower": "an", "norm": "an", "shape": "xx", "prefix": "a", "suffix": "an", "length": 2, "cluster": "3", "prob": -6.014852046966553, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "what", "id": 59, "lower": "what", "norm": "what", "shape": "xxxx", "prefix": "w", "suffix": "hat", "length": 4, "cluster": "2026", "prob": -6.023346424102783, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "up", "id": 60, "lower": "up", "norm": "up", "shape": "xx", "prefix": "u", "suffix": "up", "length": 2, "cluster": "362", "prob": -6.028695583343506, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "]", "id": 61, "lower": "]", "norm": "]", "shape": "]", "prefix": "]", "suffix": "]", "length": 1, "cluster": "0", "prob": -6.0386552810668945, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": true}
+{"orth": "\n", "id": 0, "lower": "\n", "norm": "\n", "shape": "\n", "prefix": "\n", "suffix": "\n", "length": 1, "cluster": "0", "prob": -6.0506510734558105, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "people", "id": 62, "lower": "people", "norm": "people", "shape": "xxxx", "prefix": "p", "suffix": "ple", "length": 6, "cluster": "365", "prob": -6.0715765953063965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "more", "id": 63, "lower": "more", "norm": "more", "shape": "xxxx", "prefix": "m", "suffix": "ore", "length": 4, "cluster": "1514", "prob": -6.081598281860352, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": ":", "id": 64, "lower": ":", "norm": ":", "shape": ":", "prefix": ":", "suffix": ":", "length": 1, "cluster": "228", "prob": -6.128875732421875, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "there", "id": 65, "lower": "there", "norm": "there", "shape": "xxxx", "prefix": "t", "suffix": "ere", "length": 5, "cluster": "986", "prob": -6.135282039642334, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "deleted", "id": 66, "lower": "deleted", "norm": "deleted", "shape": "xxxx", "prefix": "d", "suffix": "ted", "length": 7, "cluster": "1706", "prob": -6.1543049812316895, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "think", "id": 67, "lower": "think", "norm": "think", "shape": "xxxx", "prefix": "t", "suffix": "ink", "length": 5, "cluster": "1674", "prob": -6.180924892425537, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "will", "id": 68, "lower": "will", "norm": "will", "shape": "xxxx", "prefix": "w", "suffix": "ill", "length": 4, "cluster": "442", "prob": -6.199834823608398, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "them", "id": 69, "lower": "them", "norm": "them", "shape": "xxxx", "prefix": "t", "suffix": "hem", "length": 4, "cluster": "5994", "prob": -6.2177276611328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "we", "id": 70, "lower": "we", "norm": "we", "shape": "xx", "prefix": "w", "suffix": "we", "length": 2, "cluster": "1626", "prob": -6.230024337768555, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "'re", "id": 71, "lower": "'re", "norm": "'re", "shape": "'xx", "prefix": "'", "suffix": "'re", "length": 3, "cluster": "7162", "prob": -6.255462646484375, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "when", "id": 72, "lower": "when", "norm": "when", "shape": "xxxx", "prefix": "w", "suffix": "hen", "length": 4, "cluster": "16340", "prob": -6.2623114585876465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "You", "id": 73, "lower": "you", "norm": "You", "shape": "Xxx", "prefix": "Y", "suffix": "You", "length": 3, "cluster": "858", "prob": -6.276494026184082, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "...", "id": 74, "lower": "...", "norm": "...", "shape": "...", "prefix": ".", "suffix": "...", "length": 3, "cluster": "966", "prob": -6.278521537780762, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "some", "id": 75, "lower": "some", "norm": "some", "shape": "xxxx", "prefix": "s", "suffix": "ome", "length": 4, "cluster": "239", "prob": -6.318882465362549, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "has", "id": 76, "lower": "has", "norm": "has", "shape": "xxx", "prefix": "h", "suffix": "has", "length": 3, "cluster": "890", "prob": -6.325605392456055, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "because", "id": 77, "lower": "because", "norm": "because", "shape": "xxxx", "prefix": "b", "suffix": "use", "length": 7, "cluster": "980", "prob": -6.349620342254639, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "know", "id": 78, "lower": "know", "norm": "know", "shape": "xxxx", "prefix": "k", "suffix": "now", "length": 4, "cluster": "3722", "prob": -6.368943214416504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "really", "id": 79, "lower": "really", "norm": "really", "shape": "xxxx", "prefix": "r", "suffix": "lly", "length": 6, "cluster": "7802", "prob": -6.370757102966309, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "by", "id": 80, "lower": "by", "norm": "by", "shape": "xx", "prefix": "b", "suffix": "by", "length": 2, "cluster": "252", "prob": -6.375086784362793, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "time", "id": 81, "lower": "time", "norm": "time", "shape": "xxxx", "prefix": "t", "suffix": "ime", "length": 4, "cluster": "477", "prob": -6.3782219886779785, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "did", "id": 82, "lower": "did", "norm": "did", "shape": "xxx", "prefix": "d", "suffix": "did", "length": 3, "cluster": "8186", "prob": -6.389003753662109, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "no", "id": 83, "lower": "no", "norm": "no", "shape": "xx", "prefix": "n", "suffix": "no", "length": 2, "cluster": "4074", "prob": -6.402691841125488, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "had", "id": 84, "lower": "had", "norm": "had", "shape": "xxx", "prefix": "h", "suffix": "had", "length": 3, "cluster": "1914", "prob": -6.45427131652832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "their", "id": 85, "lower": "their", "norm": "their", "shape": "xxxx", "prefix": "t", "suffix": "eir", "length": 5, "cluster": "187", "prob": -6.461463928222656, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "If", "id": 86, "lower": "if", "norm": "If", "shape": "Xx", "prefix": "I", "suffix": "If", "length": 2, "cluster": "190", "prob": -6.469156742095947, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "how", "id": 87, "lower": "how", "norm": "how", "shape": "xxx", "prefix": "h", "suffix": "how", "length": 3, "cluster": "10218", "prob": -6.496722221374512, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "does", "id": 88, "lower": "does", "norm": "does", "shape": "xxxx", "prefix": "d", "suffix": "oes", "length": 4, "cluster": "4090", "prob": -6.500738143920898, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "who", "id": 89, "lower": "who", "norm": "who", "shape": "xxx", "prefix": "w", "suffix": "who", "length": 3, "cluster": "410", "prob": -6.504637241363525, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "than", "id": 90, "lower": "than", "norm": "than", "shape": "xxxx", "prefix": "t", "suffix": "han", "length": 4, "cluster": "106", "prob": -6.512253761291504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "good", "id": 91, "lower": "good", "norm": "good", "shape": "xxxx", "prefix": "g", "suffix": "ood", "length": 4, "cluster": "551", "prob": -6.518923759460449, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "only", "id": 92, "lower": "only", "norm": "only", "shape": "xxxx", "prefix": "o", "suffix": "nly", "length": 4, "cluster": "15594", "prob": -6.535442352294922, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "his", "id": 93, "lower": "his", "norm": "his", "shape": "xxx", "prefix": "h", "suffix": "his", "length": 3, "cluster": "123", "prob": -6.574275016784668, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "much", "id": 94, "lower": "much", "norm": "much", "shape": "xxxx", "prefix": "m", "suffix": "uch", "length": 4, "cluster": "2794", "prob": -6.584301948547363, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": ";", "id": 95, "lower": ";", "norm": ";", "shape": ";", "prefix": ";", "suffix": ";", "length": 1, "cluster": "36", "prob": -6.586422920227051, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "'ve", "id": 96, "lower": "'ve", "norm": "'ve", "shape": "'xx", "prefix": "'", "suffix": "'ve", "length": 3, "cluster": "1018", "prob": -6.593011379241943, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "could", "id": 97, "lower": "could", "norm": "could", "shape": "xxxx", "prefix": "c", "suffix": "uld", "length": 5, "cluster": "954", "prob": -6.595959186553955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "then", "id": 98, "lower": "then", "norm": "then", "shape": "xxxx", "prefix": "t", "suffix": "hen", "length": 4, "cluster": "9962", "prob": -6.598200798034668, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "other", "id": 99, "lower": "other", "norm": "other", "shape": "xxxx", "prefix": "o", "suffix": "her", "length": 5, "cluster": "47", "prob": -6.6438727378845215, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "make", "id": 100, "lower": "make", "norm": "make", "shape": "xxxx", "prefix": "m", "suffix": "ake", "length": 4, "cluster": "4618", "prob": -6.66980504989624, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "been", "id": 101, "lower": "been", "norm": "been", "shape": "xxxx", "prefix": "b", "suffix": "een", "length": 4, "cluster": "202", "prob": -6.670916557312012, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "were", "id": 102, "lower": "were", "norm": "were", "shape": "xxxx", "prefix": "w", "suffix": "ere", "length": 4, "cluster": "506", "prob": -6.673174858093262, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "see", "id": 103, "lower": "see", "norm": "see", "shape": "xxx", "prefix": "s", "suffix": "see", "length": 3, "cluster": "1546", "prob": -6.6828837394714355, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "That", "id": 104, "lower": "that", "norm": "That", "shape": "Xxxx", "prefix": "T", "suffix": "hat", "length": 4, "cluster": "1406", "prob": -6.688080310821533, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "i", "id": 105, "lower": "i", "norm": "i", "shape": "x", "prefix": "i", "suffix": "i", "length": 1, "cluster": "966", "prob": -6.6887712478637695, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "any", "id": 106, "lower": "any", "norm": "any", "shape": "xxx", "prefix": "a", "suffix": "any", "length": 3, "cluster": "12266", "prob": -6.689523220062256, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "want", "id": 107, "lower": "want", "norm": "want", "shape": "xxxx", "prefix": "w", "suffix": "ant", "length": 4, "cluster": "906", "prob": -6.694204807281494, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "even", "id": 108, "lower": "even", "norm": "even", "shape": "xxxx", "prefix": "e", "suffix": "ven", "length": 4, "cluster": "3306", "prob": -6.702912330627441, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "should", "id": 109, "lower": "should", "norm": "should", "shape": "xxxx", "prefix": "s", "suffix": "uld", "length": 6, "cluster": "698", "prob": -6.733259677886963, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "way", "id": 110, "lower": "way", "norm": "way", "shape": "xxx", "prefix": "w", "suffix": "way", "length": 3, "cluster": "1349", "prob": -6.73627233505249, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "'", "id": 111, "lower": "'", "norm": "'", "shape": "'", "prefix": "'", "suffix": "'", "length": 1, "cluster": "916", "prob": -6.73720121383667, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
+{"orth": "too", "id": 112, "lower": "too", "norm": "too", "shape": "xxx", "prefix": "t", "suffix": "too", "length": 3, "cluster": "6378", "prob": -6.77581787109375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "go", "id": 113, "lower": "go", "norm": "go", "shape": "xx", "prefix": "g", "suffix": "go", "length": 2, "cluster": "3466", "prob": -6.775965213775635, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "him", "id": 114, "lower": "him", "norm": "him", "shape": "xxx", "prefix": "h", "suffix": "him", "length": 3, "cluster": "1898", "prob": -6.783067226409912, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "This", "id": 115, "lower": "this", "norm": "This", "shape": "Xxxx", "prefix": "T", "suffix": "his", "length": 4, "cluster": "382", "prob": -6.78391695022583, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "her", "id": 116, "lower": "her", "norm": "her", "shape": "xxx", "prefix": "h", "suffix": "her", "length": 3, "cluster": "507", "prob": -6.798486709594727, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "going", "id": 117, "lower": "going", "norm": "going", "shape": "xxxx", "prefix": "g", "suffix": "ing", "length": 5, "cluster": "2090", "prob": -6.833367824554443, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "now", "id": 118, "lower": "now", "norm": "now", "shape": "xxx", "prefix": "n", "suffix": "now", "length": 3, "cluster": "1770", "prob": -6.834407329559326, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "being", "id": 119, "lower": "being", "norm": "being", "shape": "xxxx", "prefix": "b", "suffix": "ing", "length": 5, "cluster": "3818", "prob": -6.845808029174805, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "still", "id": 120, "lower": "still", "norm": "still", "shape": "xxxx", "prefix": "s", "suffix": "ill", "length": 5, "cluster": "1658", "prob": -6.867525100708008, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "into", "id": 121, "lower": "into", "norm": "into", "shape": "xxxx", "prefix": "i", "suffix": "nto", "length": 4, "cluster": "8188", "prob": -6.87359094619751, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "which", "id": 122, "lower": "which", "norm": "which", "shape": "xxxx", "prefix": "w", "suffix": "ich", "length": 5, "cluster": "154", "prob": -6.877470970153809, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "something", "id": 123, "lower": "something", "norm": "something", "shape": "xxxx", "prefix": "s", "suffix": "ing", "length": 9, "cluster": "14314", "prob": -6.887354850769043, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "she", "id": 124, "lower": "she", "norm": "she", "shape": "xxx", "prefix": "s", "suffix": "she", "length": 3, "cluster": "218", "prob": -6.90155553817749, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "also", "id": 125, "lower": "also", "norm": "also", "shape": "xxxx", "prefix": "a", "suffix": "lso", "length": 4, "cluster": "122", "prob": -6.928974151611328, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "very", "id": 126, "lower": "very", "norm": "very", "shape": "xxxx", "prefix": "v", "suffix": "ery", "length": 4, "cluster": "234", "prob": -6.93242883682251, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "right", "id": 127, "lower": "right", "norm": "right", "shape": "xxxx", "prefix": "r", "suffix": "ght", "length": 5, "cluster": "14122", "prob": -6.933711051940918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "game", "id": 128, "lower": "game", "norm": "game", "shape": "xxxx", "prefix": "g", "suffix": "ame", "length": 4, "cluster": "7973", "prob": -6.940612316131592, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "say", "id": 129, "lower": "say", "norm": "say", "shape": "xxx", "prefix": "s", "suffix": "say", "length": 3, "cluster": "1162", "prob": -6.950479984283447, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "'ll", "id": 130, "lower": "'ll", "norm": "'ll", "shape": "'xx", "prefix": "'", "suffix": "'ll", "length": 3, "cluster": "5114", "prob": -6.958071231842041, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "got", "id": 131, "lower": "got", "norm": "got", "shape": "xxx", "prefix": "g", "suffix": "got", "length": 3, "cluster": "10666", "prob": -6.98855447769165, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "well", "id": 132, "lower": "well", "norm": "well", "shape": "xxxx", "prefix": "w", "suffix": "ell", "length": 4, "cluster": "746", "prob": -6.995903968811035, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "need", "id": 133, "lower": "need", "norm": "need", "shape": "xxxx", "prefix": "n", "suffix": "eed", "length": 4, "cluster": "2954", "prob": -7.008103370666504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "And", "id": 134, "lower": "and", "norm": "And", "shape": "Xxx", "prefix": "A", "suffix": "And", "length": 3, "cluster": "1470", "prob": -7.012199401855469, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "But", "id": 135, "lower": "but", "norm": "But", "shape": "Xxx", "prefix": "B", "suffix": "But", "length": 3, "cluster": "1470", "prob": -7.0142974853515625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "over", "id": 136, "lower": "over", "norm": "over", "shape": "xxxx", "prefix": "o", "suffix": "ver", "length": 4, "cluster": "49148", "prob": -7.027544975280762, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "back", "id": 137, "lower": "back", "norm": "back", "shape": "xxxx", "prefix": "b", "suffix": "ack", "length": 4, "cluster": "7530", "prob": -7.033305644989014, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "same", "id": 138, "lower": "same", "norm": "same", "shape": "xxxx", "prefix": "s", "suffix": "ame", "length": 4, "cluster": "991", "prob": -7.053191661834717, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "thing", "id": 139, "lower": "thing", "norm": "thing", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 5, "cluster": "2013", "prob": -7.063167572021484, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "first", "id": 140, "lower": "first", "norm": "first", "shape": "xxxx", "prefix": "f", "suffix": "rst", "length": 5, "cluster": "159", "prob": -7.063716888427734, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "most", "id": 141, "lower": "most", "norm": "most", "shape": "xxxx", "prefix": "m", "suffix": "ost", "length": 4, "cluster": "175", "prob": -7.0663957595825195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "here", "id": 142, "lower": "here", "norm": "here", "shape": "xxxx", "prefix": "h", "suffix": "ere", "length": 4, "cluster": "3946", "prob": -7.0680251121521, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "ca", "id": 143, "lower": "ca", "norm": "ca", "shape": "xx", "prefix": "c", "suffix": "ca", "length": 2, "cluster": "0", "prob": -7.071251392364502, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "off", "id": 144, "lower": "off", "norm": "off", "shape": "xxx", "prefix": "o", "suffix": "off", "length": 3, "cluster": "6506", "prob": -7.073742389678955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "'d", "id": 145, "lower": "'d", "norm": "'d", "shape": "'x", "prefix": "'", "suffix": "'d", "length": 2, "cluster": "5114", "prob": -7.075286865234375, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "They", "id": 146, "lower": "they", "norm": "They", "shape": "Xxxx", "prefix": "T", "suffix": "hey", "length": 4, "cluster": "1882", "prob": -7.0789008140563965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "work", "id": 147, "lower": "work", "norm": "work", "shape": "xxxx", "prefix": "w", "suffix": "ork", "length": 4, "cluster": "1973", "prob": -7.081293106079102, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "use", "id": 148, "lower": "use", "norm": "use", "shape": "xxx", "prefix": "u", "suffix": "use", "length": 3, "cluster": "2741", "prob": -7.083596229553223, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "never", "id": 149, "lower": "never", "norm": "never", "shape": "xxxx", "prefix": "n", "suffix": "ver", "length": 5, "cluster": "15994", "prob": -7.084620475769043, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "better", "id": 150, "lower": "better", "norm": "better", "shape": "xxxx", "prefix": "b", "suffix": "ter", "length": 6, "cluster": "7658", "prob": -7.1072587966918945, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "though", "id": 151, "lower": "though", "norm": "though", "shape": "xxxx", "prefix": "t", "suffix": "ugh", "length": 6, "cluster": "2004", "prob": -7.113335132598877, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "lot", "id": 152, "lower": "lot", "norm": "lot", "shape": "xxx", "prefix": "l", "suffix": "lot", "length": 3, "cluster": "853", "prob": -7.113600254058838, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "pretty", "id": 153, "lower": "pretty", "norm": "pretty", "shape": "xxxx", "prefix": "p", "suffix": "tty", "length": 6, "cluster": "234", "prob": -7.1256103515625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "where", "id": 154, "lower": "where", "norm": "where", "shape": "xxxx", "prefix": "w", "suffix": "ere", "length": 5, "cluster": "8148", "prob": -7.146170139312744, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "am", "id": 155, "lower": "am", "norm": "am", "shape": "xx", "prefix": "a", "suffix": "am", "length": 2, "cluster": "3066", "prob": -7.149725437164307, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "things", "id": 156, "lower": "things", "norm": "things", "shape": "xxxx", "prefix": "t", "suffix": "ngs", "length": 6, "cluster": "3917", "prob": -7.154941082000732, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "sure", "id": 157, "lower": "sure", "norm": "sure", "shape": "xxxx", "prefix": "s", "suffix": "ure", "length": 4, "cluster": "490", "prob": -7.157395839691162, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "actually", "id": 158, "lower": "actually", "norm": "actually", "shape": "xxxx", "prefix": "a", "suffix": "lly", "length": 8, "cluster": "7802", "prob": -7.160778045654297, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "He", "id": 159, "lower": "he", "norm": "He", "shape": "Xx", "prefix": "H", "suffix": "He", "length": 2, "cluster": "126", "prob": -7.162238121032715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "those", "id": 160, "lower": "those", "norm": "those", "shape": "xxxx", "prefix": "t", "suffix": "ose", "length": 5, "cluster": "495", "prob": -7.169255256652832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "why", "id": 161, "lower": "why", "norm": "why", "shape": "xxx", "prefix": "w", "suffix": "why", "length": 3, "cluster": "18410", "prob": -7.178915500640869, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "So", "id": 162, "lower": "so", "norm": "So", "shape": "Xx", "prefix": "S", "suffix": "So", "length": 2, "cluster": "1726", "prob": -7.199381351470947, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "take", "id": 163, "lower": "take", "norm": "take", "shape": "xxxx", "prefix": "t", "suffix": "ake", "length": 4, "cluster": "6666", "prob": -7.209812641143799, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "down", "id": 164, "lower": "down", "norm": "down", "shape": "xxxx", "prefix": "d", "suffix": "own", "length": 4, "cluster": "2410", "prob": -7.223586082458496, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "What", "id": 165, "lower": "what", "norm": "What", "shape": "Xxxx", "prefix": "W", "suffix": "hat", "length": 4, "cluster": "702", "prob": -7.226758003234863, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "someone", "id": 166, "lower": "someone", "norm": "someone", "shape": "xxxx", "prefix": "s", "suffix": "one", "length": 7, "cluster": "30698", "prob": -7.249640464782715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "before", "id": 167, "lower": "before", "norm": "before", "shape": "xxxx", "prefix": "b", "suffix": "ore", "length": 6, "cluster": "1492", "prob": -7.253359794616699, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "said", "id": 168, "lower": "said", "norm": "said", "shape": "xxxx", "prefix": "s", "suffix": "aid", "length": 4, "cluster": "116", "prob": -7.258025169372559, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "after", "id": 169, "lower": "after", "norm": "after", "shape": "xxxx", "prefix": "a", "suffix": "ter", "length": 5, "cluster": "3540", "prob": -7.265651702880859, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "around", "id": 170, "lower": "around", "norm": "around", "shape": "xxxx", "prefix": "a", "suffix": "und", "length": 6, "cluster": "245756", "prob": -7.313362121582031, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "its", "id": 171, "lower": "its", "norm": "its", "shape": "xxx", "prefix": "i", "suffix": "its", "length": 3, "cluster": "27", "prob": -7.321457862854004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "2", "id": 172, "lower": "2", "norm": "2", "shape": "d", "prefix": "2", "suffix": "2", "length": 1, "cluster": "818", "prob": -7.324268341064453, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "feel", "id": 173, "lower": "feel", "norm": "feel", "shape": "xxxx", "prefix": "f", "suffix": "eel", "length": 4, "cluster": "1674", "prob": -7.342533588409424, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "My", "id": 174, "lower": "my", "norm": "My", "shape": "Xx", "prefix": "M", "suffix": "My", "length": 2, "cluster": "94", "prob": -7.345071792602539, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "There", "id": 175, "lower": "there", "norm": "There", "shape": "Xxxxx", "prefix": "T", "suffix": "ere", "length": 5, "cluster": "1918", "prob": -7.347356796264648, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "look", "id": 176, "lower": "look", "norm": "look", "shape": "xxxx", "prefix": "l", "suffix": "ook", "length": 4, "cluster": "2442", "prob": -7.352481365203857, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "these", "id": 177, "lower": "these", "norm": "these", "shape": "xxxx", "prefix": "t", "suffix": "ese", "length": 5, "cluster": "1519", "prob": -7.36269474029541, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "years", "id": 178, "lower": "years", "norm": "years", "shape": "xxxx", "prefix": "y", "suffix": "ars", "length": 5, "cluster": "189", "prob": -7.368987560272217, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "love", "id": 179, "lower": "love", "norm": "love", "shape": "xxxx", "prefix": "l", "suffix": "ove", "length": 4, "cluster": "2661", "prob": -7.372685432434082, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "always", "id": 180, "lower": "always", "norm": "always", "shape": "xxxx", "prefix": "a", "suffix": "ays", "length": 6, "cluster": "15994", "prob": -7.37296724319458, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "many", "id": 181, "lower": "many", "norm": "many", "shape": "xxxx", "prefix": "m", "suffix": "any", "length": 4, "cluster": "751", "prob": -7.377613067626953, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "&gt", "id": 0, "lower": "&gt", "norm": "&gt", "shape": "&xx", "prefix": "&", "suffix": "&gt", "length": 3, "cluster": "0", "prob": -7.38146448135376, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "A", "id": 182, "lower": "a", "norm": "A", "shape": "X", "prefix": "A", "suffix": "A", "length": 1, "cluster": "222", "prob": -7.38541841506958, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "point", "id": 183, "lower": "point", "norm": "point", "shape": "xxxx", "prefix": "p", "suffix": "int", "length": 5, "cluster": "389", "prob": -7.386973857879639, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "find", "id": 184, "lower": "find", "norm": "find", "shape": "xxxx", "prefix": "f", "suffix": "ind", "length": 4, "cluster": "5642", "prob": -7.387212753295898, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "probably", "id": 185, "lower": "probably", "norm": "probably", "shape": "xxxx", "prefix": "p", "suffix": "bly", "length": 8, "cluster": "5754", "prob": -7.395048141479492, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "new", "id": 186, "lower": "new", "norm": "new", "shape": "xxx", "prefix": "n", "suffix": "new", "length": 3, "cluster": "199", "prob": -7.398182392120361, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "made", "id": 187, "lower": "made", "norm": "made", "shape": "xxxx", "prefix": "m", "suffix": "ade", "length": 4, "cluster": "120490", "prob": -7.399899005889893, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "day", "id": 188, "lower": "day", "norm": "day", "shape": "xxx", "prefix": "d", "suffix": "day", "length": 3, "cluster": "989", "prob": -7.400947093963623, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "We", "id": 189, "lower": "we", "norm": "We", "shape": "Xx", "prefix": "W", "suffix": "We", "length": 2, "cluster": "858", "prob": -7.402578353881836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "every", "id": 190, "lower": "every", "norm": "every", "shape": "xxxx", "prefix": "e", "suffix": "ery", "length": 5, "cluster": "61418", "prob": -7.414647579193115, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "great", "id": 191, "lower": "great", "norm": "great", "shape": "xxxx", "prefix": "g", "suffix": "eat", "length": 5, "cluster": "1831", "prob": -7.420454502105713, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "our", "id": 192, "lower": "our", "norm": "our", "shape": "xxx", "prefix": "o", "suffix": "our", "length": 3, "cluster": "59", "prob": -7.4210286140441895, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "two", "id": 193, "lower": "two", "norm": "two", "shape": "xxx", "prefix": "t", "suffix": "two", "length": 3, "cluster": "15", "prob": -7.433600425720215, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "anything", "id": 194, "lower": "anything", "norm": "anything", "shape": "xxxx", "prefix": "a", "suffix": "ing", "length": 8, "cluster": "14314", "prob": -7.439383506774902, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "while", "id": 195, "lower": "while", "norm": "while", "shape": "xxxx", "prefix": "w", "suffix": "ile", "length": 5, "cluster": "6100", "prob": -7.440170764923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "few", "id": 196, "lower": "few", "norm": "few", "shape": "xxx", "prefix": "f", "suffix": "few", "length": 3, "cluster": "79", "prob": -7.440912246704102, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "$", "id": 197, "lower": "$", "norm": "$", "shape": "$", "prefix": "$", "suffix": "$", "length": 1, "cluster": "18", "prob": -7.450106620788574, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "bad", "id": 198, "lower": "bad", "norm": "bad", "shape": "xxx", "prefix": "b", "suffix": "bad", "length": 3, "cluster": "551", "prob": -7.452563762664795, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "No", "id": 199, "lower": "no", "norm": "No", "shape": "Xx", "prefix": "N", "suffix": "No", "length": 2, "cluster": "94", "prob": -7.456389427185059, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "little", "id": 200, "lower": "little", "norm": "little", "shape": "xxxx", "prefix": "l", "suffix": "tle", "length": 6, "cluster": "1959", "prob": -7.480203628540039, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "might", "id": 201, "lower": "might", "norm": "might", "shape": "xxxx", "prefix": "m", "suffix": "ght", "length": 5, "cluster": "186", "prob": -7.490107536315918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "best", "id": 202, "lower": "best", "norm": "best", "shape": "xxxx", "prefix": "b", "suffix": "est", "length": 4, "cluster": "479", "prob": -7.492556571960449, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "play", "id": 203, "lower": "play", "norm": "play", "shape": "xxxx", "prefix": "p", "suffix": "lay", "length": 4, "cluster": "1717", "prob": -7.50220251083374, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "shit", "id": 204, "lower": "shit", "norm": "shit", "shape": "xxxx", "prefix": "s", "suffix": "hit", "length": 4, "cluster": "0", "prob": -7.522359371185303, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "try", "id": 205, "lower": "try", "norm": "try", "shape": "xxx", "prefix": "t", "suffix": "try", "length": 3, "cluster": "1930", "prob": -7.540920734405518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "used", "id": 206, "lower": "used", "norm": "used", "shape": "xxxx", "prefix": "u", "suffix": "sed", "length": 4, "cluster": "15402", "prob": -7.542972087860107, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "long", "id": 207, "lower": "long", "norm": "long", "shape": "xxxx", "prefix": "l", "suffix": "ong", "length": 4, "cluster": "935", "prob": -7.544892311096191, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "doing", "id": 208, "lower": "doing", "norm": "doing", "shape": "xxxx", "prefix": "d", "suffix": "ing", "length": 5, "cluster": "15338", "prob": -7.553442478179932, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "getting", "id": 209, "lower": "getting", "norm": "getting", "shape": "xxxx", "prefix": "g", "suffix": "ing", "length": 7, "cluster": "31722", "prob": -7.564762115478516, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "post", "id": 210, "lower": "post", "norm": "post", "shape": "xxxx", "prefix": "p", "suffix": "ost", "length": 4, "cluster": "3733", "prob": -7.565684795379639, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "year", "id": 211, "lower": "year", "norm": "year", "shape": "xxxx", "prefix": "y", "suffix": "ear", "length": 4, "cluster": "29", "prob": -7.567681312561035, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Do", "id": 212, "lower": "do", "norm": "Do", "shape": "Xx", "prefix": "D", "suffix": "Do", "length": 2, "cluster": "702", "prob": -7.570033073425293, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "life", "id": 213, "lower": "life", "norm": "life", "shape": "xxxx", "prefix": "l", "suffix": "ife", "length": 4, "cluster": "1893", "prob": -7.574200630187988, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "through", "id": 214, "lower": "through", "norm": "through", "shape": "xxxx", "prefix": "t", "suffix": "ugh", "length": 7, "cluster": "65532", "prob": -7.575429439544678, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "guy", "id": 215, "lower": "guy", "norm": "guy", "shape": "xxx", "prefix": "g", "suffix": "guy", "length": 3, "cluster": "549", "prob": -7.582011699676514, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "enough", "id": 216, "lower": "enough", "norm": "enough", "shape": "xxxx", "prefix": "e", "suffix": "ugh", "length": 6, "cluster": "1834", "prob": -7.586349010467529, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "ever", "id": 217, "lower": "ever", "norm": "ever", "shape": "xxxx", "prefix": "e", "suffix": "ver", "length": 4, "cluster": "14058", "prob": -7.591183662414551, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "In", "id": 218, "lower": "in", "norm": "In", "shape": "Xx", "prefix": "I", "suffix": "In", "length": 2, "cluster": "62", "prob": -7.603263854980469, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "give", "id": 219, "lower": "give", "norm": "give", "shape": "xxxx", "prefix": "g", "suffix": "ive", "length": 4, "cluster": "522", "prob": -7.611863136291504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "mean", "id": 220, "lower": "mean", "norm": "mean", "shape": "xxxx", "prefix": "m", "suffix": "ean", "length": 4, "cluster": "3082", "prob": -7.611870765686035, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "thought", "id": 221, "lower": "thought", "norm": "thought", "shape": "xxxx", "prefix": "t", "suffix": "ght", "length": 7, "cluster": "650", "prob": -7.614910125732422, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "since", "id": 222, "lower": "since", "norm": "since", "shape": "xxxx", "prefix": "s", "suffix": "nce", "length": 5, "cluster": "468", "prob": -7.615171909332275, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "|", "id": 223, "lower": "|", "norm": "|", "shape": "|", "prefix": "|", "suffix": "|", "length": 1, "cluster": "0", "prob": -7.6297454833984375, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "different", "id": 224, "lower": "different", "norm": "different", "shape": "xxxx", "prefix": "d", "suffix": "ent", "length": 9, "cluster": "1319", "prob": -7.630640506744385, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "3", "id": 225, "lower": "3", "norm": "3", "shape": "d", "prefix": "3", "suffix": "3", "length": 1, "cluster": "818", "prob": -7.636006832122803, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "last", "id": 226, "lower": "last", "norm": "last", "shape": "xxxx", "prefix": "l", "suffix": "ast", "length": 4, "cluster": "127", "prob": -7.636077404022217, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "own", "id": 227, "lower": "own", "norm": "own", "shape": "xxx", "prefix": "o", "suffix": "own", "length": 3, "cluster": "217", "prob": -7.636797904968262, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "1", "id": 228, "lower": "1", "norm": "1", "shape": "d", "prefix": "1", "suffix": "1", "length": 1, "cluster": "306", "prob": -7.639832973480225, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "us", "id": 229, "lower": "us", "norm": "us", "shape": "xx", "prefix": "u", "suffix": "us", "length": 2, "cluster": "1898", "prob": -7.643693923950195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "%", "id": 230, "lower": "%", "norm": "%", "shape": "%", "prefix": "%", "suffix": "%", "length": 1, "cluster": "34", "prob": -7.645323753356934, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Not", "id": 231, "lower": "not", "norm": "Not", "shape": "Xxx", "prefix": "N", "suffix": "Not", "length": 3, "cluster": "1982", "prob": -7.65825080871582, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "put", "id": 232, "lower": "put", "norm": "put", "shape": "xxx", "prefix": "p", "suffix": "put", "length": 3, "cluster": "6314", "prob": -7.666473865509033, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "man", "id": 233, "lower": "man", "norm": "man", "shape": "xxx", "prefix": "m", "suffix": "man", "length": 3, "cluster": "549", "prob": -7.668745517730713, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "may", "id": 234, "lower": "may", "norm": "may", "shape": "xxx", "prefix": "m", "suffix": "may", "length": 3, "cluster": "186", "prob": -7.678494930267334, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "makes", "id": 235, "lower": "makes", "norm": "makes", "shape": "xxxx", "prefix": "m", "suffix": "kes", "length": 5, "cluster": "426", "prob": -7.684445858001709, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "money", "id": 236, "lower": "money", "norm": "money", "shape": "xxxx", "prefix": "m", "suffix": "ney", "length": 5, "cluster": "357", "prob": -7.693631172180176, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": ":)", "id": 237, "lower": ":)", "norm": ":)", "shape": ":)", "prefix": ":", "suffix": ":)", "length": 2, "cluster": "0", "prob": -7.694086074829102, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "without", "id": 238, "lower": "without", "norm": "without", "shape": "xxxx", "prefix": "w", "suffix": "out", "length": 7, "cluster": "57340", "prob": -7.694504261016846, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "bit", "id": 239, "lower": "bit", "norm": "bit", "shape": "xxx", "prefix": "b", "suffix": "bit", "length": 3, "cluster": "853", "prob": -7.721855640411377, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "person", "id": 240, "lower": "person", "norm": "person", "shape": "xxxx", "prefix": "p", "suffix": "son", "length": 6, "cluster": "549", "prob": -7.727076530456543, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Also", "id": 241, "lower": "also", "norm": "Also", "shape": "Xxxx", "prefix": "A", "suffix": "lso", "length": 4, "cluster": "254", "prob": -7.734253406524658, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "again", "id": 242, "lower": "again", "norm": "again", "shape": "xxxx", "prefix": "a", "suffix": "ain", "length": 5, "cluster": "28522", "prob": -7.7370924949646, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Just", "id": 243, "lower": "just", "norm": "Just", "shape": "Xxxx", "prefix": "J", "suffix": "ust", "length": 4, "cluster": "1982", "prob": -7.743429183959961, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "both", "id": 244, "lower": "both", "norm": "both", "shape": "xxxx", "prefix": "b", "suffix": "oth", "length": 4, "cluster": "1007", "prob": -7.750914573669434, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "help", "id": 245, "lower": "help", "norm": "help", "shape": "xxxx", "prefix": "h", "suffix": "elp", "length": 4, "cluster": "309", "prob": -7.758815288543701, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "trying", "id": 246, "lower": "trying", "norm": "trying", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 6, "cluster": "14378", "prob": -7.759474754333496, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "least", "id": 247, "lower": "least", "norm": "least", "shape": "xxxx", "prefix": "l", "suffix": "ast", "length": 5, "cluster": "3690", "prob": -7.7660088539123535, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "come", "id": 248, "lower": "come", "norm": "come", "shape": "xxxx", "prefix": "c", "suffix": "ome", "length": 4, "cluster": "7562", "prob": -7.775856971740723, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "keep", "id": 249, "lower": "keep", "norm": "keep", "shape": "xxxx", "prefix": "k", "suffix": "eep", "length": 4, "cluster": "3338", "prob": -7.778285980224609, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Thanks", "id": 250, "lower": "thanks", "norm": "Thanks", "shape": "Xxxxx", "prefix": "T", "suffix": "nks", "length": 6, "cluster": "510", "prob": -7.781467914581299, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "read", "id": 251, "lower": "read", "norm": "read", "shape": "xxxx", "prefix": "r", "suffix": "ead", "length": 4, "cluster": "6314", "prob": -7.787075042724609, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "nt", "id": 252, "lower": "nt", "norm": "nt", "shape": "xx", "prefix": "n", "suffix": "nt", "length": 2, "cluster": "3685", "prob": -7.788322925567627, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "part", "id": 253, "lower": "part", "norm": "part", "shape": "xxxx", "prefix": "p", "suffix": "art", "length": 4, "cluster": "725", "prob": -7.791079521179199, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "let", "id": 254, "lower": "let", "norm": "let", "shape": "xxx", "prefix": "l", "suffix": "let", "length": 3, "cluster": "522", "prob": -7.795135974884033, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "hard", "id": 255, "lower": "hard", "norm": "hard", "shape": "xxxx", "prefix": "h", "suffix": "ard", "length": 4, "cluster": "2538", "prob": -7.795384407043457, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "another", "id": 256, "lower": "another", "norm": "another", "shape": "xxxx", "prefix": "a", "suffix": "her", "length": 7, "cluster": "28650", "prob": -7.801506519317627, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "end", "id": 257, "lower": "end", "norm": "end", "shape": "xxx", "prefix": "e", "suffix": "end", "length": 3, "cluster": "21", "prob": -7.816553115844727, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "having", "id": 258, "lower": "having", "norm": "having", "shape": "xxxx", "prefix": "h", "suffix": "ing", "length": 6, "cluster": "130026", "prob": -7.818792819976807, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "As", "id": 259, "lower": "as", "norm": "As", "shape": "Xx", "prefix": "A", "suffix": "As", "length": 2, "cluster": "958", "prob": -7.836142539978027, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "games", "id": 260, "lower": "games", "norm": "games", "shape": "xxxx", "prefix": "g", "suffix": "mes", "length": 5, "cluster": "1485", "prob": -7.836157321929932, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "already", "id": 261, "lower": "already", "norm": "already", "shape": "xxxx", "prefix": "a", "suffix": "ady", "length": 7, "cluster": "634", "prob": -7.838688850402832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "..", "id": 0, "lower": "..", "norm": "..", "shape": "..", "prefix": ".", "suffix": "..", "length": 2, "cluster": "4906", "prob": -7.840396404266357, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "problem", "id": 262, "lower": "problem", "norm": "problem", "shape": "xxxx", "prefix": "p", "suffix": "lem", "length": 7, "cluster": "16069", "prob": -7.841479301452637, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "kind", "id": 263, "lower": "kind", "norm": "kind", "shape": "xxxx", "prefix": "k", "suffix": "ind", "length": 4, "cluster": "213", "prob": -7.844367980957031, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "old", "id": 264, "lower": "old", "norm": "old", "shape": "xxx", "prefix": "o", "suffix": "old", "length": 3, "cluster": "2346", "prob": -7.845602989196777, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "everyone", "id": 265, "lower": "everyone", "norm": "everyone", "shape": "xxxx", "prefix": "e", "suffix": "one", "length": 8, "cluster": "30698", "prob": -7.850788116455078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "saying", "id": 266, "lower": "saying", "norm": "saying", "shape": "xxxx", "prefix": "s", "suffix": "ing", "length": 6, "cluster": "3732", "prob": -7.854340076446533, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "idea", "id": 267, "lower": "idea", "norm": "idea", "shape": "xxxx", "prefix": "i", "suffix": "dea", "length": 4, "cluster": "709", "prob": -7.855560779571533, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "else", "id": 268, "lower": "else", "norm": "else", "shape": "xxxx", "prefix": "e", "suffix": "lse", "length": 4, "cluster": "2013", "prob": -7.86043643951416, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "reason", "id": 269, "lower": "reason", "norm": "reason", "shape": "xxxx", "prefix": "r", "suffix": "son", "length": 6, "cluster": "113", "prob": -7.867291450500488, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Well", "id": 270, "lower": "well", "norm": "Well", "shape": "Xxxx", "prefix": "W", "suffix": "ell", "length": 4, "cluster": "1726", "prob": -7.871857643127441, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "less", "id": 271, "lower": "less", "norm": "less", "shape": "xxxx", "prefix": "l", "suffix": "ess", "length": 4, "cluster": "5610", "prob": -7.872425079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "world", "id": 272, "lower": "world", "norm": "world", "shape": "xxxx", "prefix": "w", "suffix": "rld", "length": 5, "cluster": "329", "prob": -7.8744120597839355, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "wrong", "id": 273, "lower": "wrong", "norm": "wrong", "shape": "xxxx", "prefix": "w", "suffix": "ong", "length": 5, "cluster": "4586", "prob": -7.876842021942139, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "How", "id": 274, "lower": "how", "norm": "How", "shape": "Xxx", "prefix": "H", "suffix": "How", "length": 3, "cluster": "702", "prob": -7.879385948181152, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "far", "id": 275, "lower": "far", "norm": "far", "shape": "xxx", "prefix": "f", "suffix": "far", "length": 3, "cluster": "6890", "prob": -7.8802924156188965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "big", "id": 276, "lower": "big", "norm": "big", "shape": "xxx", "prefix": "b", "suffix": "big", "length": 3, "cluster": "135", "prob": -7.880735874176025, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "done", "id": 277, "lower": "done", "norm": "done", "shape": "xxxx", "prefix": "d", "suffix": "one", "length": 4, "cluster": "26282", "prob": -7.886453151702881, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "believe", "id": 278, "lower": "believe", "norm": "believe", "shape": "xxxx", "prefix": "b", "suffix": "eve", "length": 7, "cluster": "138", "prob": -7.886724948883057, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Yeah", "id": 279, "lower": "yeah", "norm": "Yeah", "shape": "Xxxx", "prefix": "Y", "suffix": "eah", "length": 4, "cluster": "1726", "prob": -7.890377044677734, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "such", "id": 280, "lower": "such", "norm": "such", "shape": "xxxx", "prefix": "s", "suffix": "uch", "length": 4, "cluster": "111", "prob": -7.894707679748535, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "stuff", "id": 281, "lower": "stuff", "norm": "stuff", "shape": "xxxx", "prefix": "s", "suffix": "uff", "length": 5, "cluster": "6853", "prob": -7.898244380950928, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "away", "id": 282, "lower": "away", "norm": "away", "shape": "xxxx", "prefix": "a", "suffix": "way", "length": 4, "cluster": "3434", "prob": -7.9017462730407715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "nothing", "id": 283, "lower": "nothing", "norm": "nothing", "shape": "xxxx", "prefix": "n", "suffix": "ing", "length": 7, "cluster": "14314", "prob": -7.909971714019775, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "tell", "id": 284, "lower": "tell", "norm": "tell", "shape": "xxxx", "prefix": "t", "suffix": "ell", "length": 4, "cluster": "1546", "prob": -7.910365581512451, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "looking", "id": 285, "lower": "looking", "norm": "looking", "shape": "xxxx", "prefix": "l", "suffix": "ing", "length": 7, "cluster": "1066", "prob": -7.911639213562012, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "start", "id": 286, "lower": "start", "norm": "start", "shape": "xxxx", "prefix": "s", "suffix": "art", "length": 5, "cluster": "3978", "prob": -7.923925876617432, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "using", "id": 287, "lower": "using", "norm": "using", "shape": "xxxx", "prefix": "u", "suffix": "ing", "length": 5, "cluster": "7146", "prob": -7.938363075256348, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "able", "id": 288, "lower": "able", "norm": "able", "shape": "xxxx", "prefix": "a", "suffix": "ble", "length": 4, "cluster": "6186", "prob": -7.939544677734375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "place", "id": 289, "lower": "place", "norm": "place", "shape": "xxxx", "prefix": "p", "suffix": "ace", "length": 5, "cluster": "6245", "prob": -7.954748153686523, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "high", "id": 290, "lower": "high", "norm": "high", "shape": "xxxx", "prefix": "h", "suffix": "igh", "length": 4, "cluster": "167", "prob": -7.963760852813721, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "until", "id": 291, "lower": "until", "norm": "until", "shape": "xxxx", "prefix": "u", "suffix": "til", "length": 5, "cluster": "2516", "prob": -7.964784622192383, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "either", "id": 292, "lower": "either", "norm": "either", "shape": "xxxx", "prefix": "e", "suffix": "her", "length": 6, "cluster": "30698", "prob": -7.965897560119629, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "seen", "id": 293, "lower": "seen", "norm": "seen", "shape": "xxxx", "prefix": "s", "suffix": "een", "length": 4, "cluster": "26282", "prob": -7.97322416305542, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "times", "id": 294, "lower": "times", "norm": "times", "shape": "xxxx", "prefix": "t", "suffix": "mes", "length": 5, "cluster": "61", "prob": -7.9734907150268555, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "real", "id": 295, "lower": "real", "norm": "real", "shape": "xxxx", "prefix": "r", "suffix": "eal", "length": 4, "cluster": "503", "prob": -7.981620788574219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "When", "id": 296, "lower": "when", "norm": "When", "shape": "Xxxx", "prefix": "W", "suffix": "hen", "length": 4, "cluster": "190", "prob": -7.982150554656982, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "making", "id": 297, "lower": "making", "norm": "making", "shape": "xxxx", "prefix": "m", "suffix": "ing", "length": 6, "cluster": "7146", "prob": -7.985988616943359, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "seems", "id": 298, "lower": "seems", "norm": "seems", "shape": "xxxx", "prefix": "s", "suffix": "ems", "length": 5, "cluster": "16298", "prob": -7.989145278930664, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "fuck", "id": 299, "lower": "fuck", "norm": "fuck", "shape": "xxxx", "prefix": "f", "suffix": "uck", "length": 4, "cluster": "0", "prob": -7.992913246154785, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "fucking", "id": 300, "lower": "fucking", "norm": "fucking", "shape": "xxxx", "prefix": "f", "suffix": "ing", "length": 7, "cluster": "0", "prob": -7.993165969848633, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "\n\n\n", "id": 0, "lower": "\n\n\n", "norm": "\n\n\n", "shape": "\n\n\n", "prefix": "\n", "suffix": "\n\n\n", "length": 3, "cluster": "0", "prob": -7.996075630187988, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "next", "id": 301, "lower": "next", "norm": "next", "shape": "xxxx", "prefix": "n", "suffix": "ext", "length": 4, "cluster": "255", "prob": -7.996739864349365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "anyone", "id": 302, "lower": "anyone", "norm": "anyone", "shape": "xxxx", "prefix": "a", "suffix": "one", "length": 6, "cluster": "30698", "prob": -7.997350215911865, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "#", "id": 303, "lower": "#", "norm": "#", "shape": "#", "prefix": "#", "suffix": "#", "length": 1, "cluster": "18", "prob": -8.001263618469238, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "looks", "id": 304, "lower": "looks", "norm": "looks", "shape": "xxxx", "prefix": "l", "suffix": "oks", "length": 5, "cluster": "2442", "prob": -8.001678466796875, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "everything", "id": 305, "lower": "everything", "norm": "everything", "shape": "xxxx", "prefix": "e", "suffix": "ing", "length": 10, "cluster": "14314", "prob": -8.00584602355957, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Oh", "id": 306, "lower": "oh", "norm": "Oh", "shape": "Xx", "prefix": "O", "suffix": "Oh", "length": 2, "cluster": "1726", "prob": -8.007224082946777, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "nice", "id": 307, "lower": "nice", "norm": "nice", "shape": "xxxx", "prefix": "n", "suffix": "ice", "length": 4, "cluster": "551", "prob": -8.009806632995605, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "once", "id": 308, "lower": "once", "norm": "once", "shape": "xxxx", "prefix": "o", "suffix": "nce", "length": 4, "cluster": "22250", "prob": -8.010163307189941, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "show", "id": 309, "lower": "show", "norm": "show", "shape": "xxxx", "prefix": "s", "suffix": "how", "length": 4, "cluster": "7690", "prob": -8.011373519897461, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "maybe", "id": 310, "lower": "maybe", "norm": "maybe", "shape": "xxxx", "prefix": "m", "suffix": "ybe", "length": 5, "cluster": "60650", "prob": -8.020626068115234, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "fact", "id": 311, "lower": "fact", "norm": "fact", "shape": "xxxx", "prefix": "f", "suffix": "act", "length": 4, "cluster": "369", "prob": -8.032754898071289, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "wo", "id": 312, "lower": "wo", "norm": "wo", "shape": "xx", "prefix": "w", "suffix": "wo", "length": 2, "cluster": "26", "prob": -8.0400972366333, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "5", "id": 313, "lower": "5", "norm": "5", "shape": "d", "prefix": "5", "suffix": "5", "length": 1, "cluster": "818", "prob": -8.040534019470215, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "free", "id": 314, "lower": "free", "norm": "free", "shape": "xxxx", "prefix": "f", "suffix": "ree", "length": 4, "cluster": "6634", "prob": -8.0440092086792, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "understand", "id": 315, "lower": "understand", "norm": "understand", "shape": "xxxx", "prefix": "u", "suffix": "and", "length": 10, "cluster": "3722", "prob": -8.052404403686523, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "team", "id": 316, "lower": "team", "norm": "team", "shape": "xxxx", "prefix": "t", "suffix": "eam", "length": 4, "cluster": "1061", "prob": -8.053070068359375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "....", "id": 317, "lower": "....", "norm": "....", "shape": "....", "prefix": ".", "suffix": "...", "length": 4, "cluster": "1202", "prob": -8.05477523803711, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "against", "id": 318, "lower": "against", "norm": "against", "shape": "xxxx", "prefix": "a", "suffix": "nst", "length": 7, "cluster": "24572", "prob": -8.064282417297363, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "live", "id": 319, "lower": "live", "norm": "live", "shape": "xxxx", "prefix": "l", "suffix": "ive", "length": 4, "cluster": "1418", "prob": -8.065953254699707, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": " \n\n", "id": 0, "lower": " \n\n", "norm": " \n\n", "shape": " \n\n", "prefix": " ", "suffix": " \n\n", "length": 3, "cluster": "0", "prob": -8.068946838378906, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Why", "id": 320, "lower": "why", "norm": "Why", "shape": "Xxx", "prefix": "W", "suffix": "Why", "length": 3, "cluster": "702", "prob": -8.06901741027832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "whole", "id": 321, "lower": "whole", "norm": "whole", "shape": "xxxx", "prefix": "w", "suffix": "ole", "length": 5, "cluster": "71", "prob": -8.070209503173828, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "For", "id": 322, "lower": "for", "norm": "For", "shape": "Xxx", "prefix": "F", "suffix": "For", "length": 3, "cluster": "1342", "prob": -8.072200775146484, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "guys", "id": 323, "lower": "guys", "norm": "guys", "shape": "xxxx", "prefix": "g", "suffix": "uys", "length": 4, "cluster": "365", "prob": -8.075167655944824, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "job", "id": 324, "lower": "job", "norm": "job", "shape": "xxx", "prefix": "j", "suffix": "job", "length": 3, "cluster": "37", "prob": -8.082273483276367, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "etc", "id": 325, "lower": "etc", "norm": "etc", "shape": "xxx", "prefix": "e", "suffix": "etc", "length": 3, "cluster": "26", "prob": -8.087606430053711, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "4", "id": 326, "lower": "4", "norm": "4", "shape": "d", "prefix": "4", "suffix": "4", "length": 1, "cluster": "818", "prob": -8.088510513305664, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "went", "id": 327, "lower": "went", "norm": "went", "shape": "xxxx", "prefix": "w", "suffix": "ent", "length": 4, "cluster": "7338", "prob": -8.091073989868164, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "school", "id": 328, "lower": "school", "norm": "school", "shape": "xxxx", "prefix": "s", "suffix": "ool", "length": 6, "cluster": "1829", "prob": -8.096077919006348, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "guess", "id": 329, "lower": "guess", "norm": "guess", "shape": "xxxx", "prefix": "g", "suffix": "ess", "length": 5, "cluster": "650", "prob": -8.097951889038086, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "friends", "id": 330, "lower": "friends", "norm": "friends", "shape": "xxxx", "prefix": "f", "suffix": "nds", "length": 7, "cluster": "3565", "prob": -8.10158634185791, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "between", "id": 331, "lower": "between", "norm": "between", "shape": "xxxx", "prefix": "b", "suffix": "een", "length": 7, "cluster": "12284", "prob": -8.106386184692383, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "case", "id": 332, "lower": "case", "norm": "case", "shape": "xxxx", "prefix": "c", "suffix": "ase", "length": 4, "cluster": "3269", "prob": -8.106882095336914, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "She", "id": 333, "lower": "she", "norm": "She", "shape": "Xxx", "prefix": "S", "suffix": "She", "length": 3, "cluster": "126", "prob": -8.119241714477539, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "each", "id": 334, "lower": "each", "norm": "each", "shape": "xxxx", "prefix": "e", "suffix": "ach", "length": 4, "cluster": "32746", "prob": -8.123948097229004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "fun", "id": 335, "lower": "fun", "norm": "fun", "shape": "xxx", "prefix": "f", "suffix": "fun", "length": 3, "cluster": "16229", "prob": -8.124406814575195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "agree", "id": 336, "lower": "agree", "norm": "agree", "shape": "xxxx", "prefix": "a", "suffix": "ree", "length": 5, "cluster": "394", "prob": -8.12778091430664, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Is", "id": 337, "lower": "is", "norm": "Is", "shape": "Xx", "prefix": "I", "suffix": "Is", "length": 2, "cluster": "1214", "prob": -8.129456520080566, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "buy", "id": 338, "lower": "buy", "norm": "buy", "shape": "xxx", "prefix": "b", "suffix": "buy", "length": 3, "cluster": "2826", "prob": -8.142950057983398, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Yes", "id": 339, "lower": "yes", "norm": "Yes", "shape": "Xxx", "prefix": "Y", "suffix": "Yes", "length": 3, "cluster": "1726", "prob": -8.147512435913086, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "run", "id": 340, "lower": "run", "norm": "run", "shape": "xxx", "prefix": "r", "suffix": "run", "length": 3, "cluster": "437", "prob": -8.156776428222656, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "change", "id": 341, "lower": "change", "norm": "change", "shape": "xxxx", "prefix": "c", "suffix": "nge", "length": 6, "cluster": "2997", "prob": -8.157740592956543, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "found", "id": 342, "lower": "found", "norm": "found", "shape": "xxxx", "prefix": "f", "suffix": "und", "length": 5, "cluster": "13738", "prob": -8.182107925415039, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "question", "id": 343, "lower": "question", "norm": "question", "shape": "xxxx", "prefix": "q", "suffix": "ion", "length": 8, "cluster": "709", "prob": -8.185464859008789, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "top", "id": 344, "lower": "top", "norm": "top", "shape": "xxx", "prefix": "t", "suffix": "top", "length": 3, "cluster": "1479", "prob": -8.191086769104004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "playing", "id": 345, "lower": "playing", "norm": "playing", "shape": "xxxx", "prefix": "p", "suffix": "ing", "length": 7, "cluster": "11242", "prob": -8.191595077514648, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "name", "id": 346, "lower": "name", "norm": "name", "shape": "xxxx", "prefix": "n", "suffix": "ame", "length": 4, "cluster": "4021", "prob": -8.19616985321045, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "mind", "id": 347, "lower": "mind", "norm": "mind", "shape": "xxxx", "prefix": "m", "suffix": "ind", "length": 4, "cluster": "1893", "prob": -8.197138786315918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "myself", "id": 348, "lower": "myself", "norm": "myself", "shape": "xxxx", "prefix": "m", "suffix": "elf", "length": 6, "cluster": "8042", "prob": -8.200143814086914, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "gets", "id": 349, "lower": "gets", "norm": "gets", "shape": "xxxx", "prefix": "g", "suffix": "ets", "length": 4, "cluster": "10666", "prob": -8.202808380126953, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "ago", "id": 350, "lower": "ago", "norm": "ago", "shape": "xxx", "prefix": "a", "suffix": "ago", "length": 3, "cluster": "6442", "prob": -8.206598281860352, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "friend", "id": 351, "lower": "friend", "norm": "friend", "shape": "xxxx", "prefix": "f", "suffix": "end", "length": 6, "cluster": "1061", "prob": -8.210515975952148, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "talking", "id": 352, "lower": "talking", "norm": "talking", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 7, "cluster": "4586", "prob": -8.22729778289795, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "days", "id": 353, "lower": "days", "norm": "days", "shape": "xxxx", "prefix": "d", "suffix": "ays", "length": 4, "cluster": "317", "prob": -8.227437973022461, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "yet", "id": 354, "lower": "yet", "norm": "yet", "shape": "xxx", "prefix": "y", "suffix": "yet", "length": 3, "cluster": "32490", "prob": -8.229137420654297, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "means", "id": 355, "lower": "means", "norm": "means", "shape": "xxxx", "prefix": "m", "suffix": "ans", "length": 5, "cluster": "31146", "prob": -8.234617233276367, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "hope", "id": 356, "lower": "hope", "norm": "hope", "shape": "xxxx", "prefix": "h", "suffix": "ope", "length": 4, "cluster": "650", "prob": -8.236272811889648, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "almost", "id": 357, "lower": "almost", "norm": "almost", "shape": "xxxx", "prefix": "a", "suffix": "ost", "length": 6, "cluster": "7402", "prob": -8.236738204956055, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "yourself", "id": 358, "lower": "yourself", "norm": "yourself", "shape": "xxxx", "prefix": "y", "suffix": "elf", "length": 8, "cluster": "8042", "prob": -8.2402982711792, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "awesome", "id": 359, "lower": "awesome", "norm": "awesome", "shape": "xxxx", "prefix": "a", "suffix": "ome", "length": 7, "cluster": "871", "prob": -8.247021675109863, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "care", "id": 360, "lower": "care", "norm": "care", "shape": "xxxx", "prefix": "c", "suffix": "are", "length": 4, "cluster": "1229", "prob": -8.248679161071777, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "quite", "id": 361, "lower": "quite", "norm": "quite", "shape": "xxxx", "prefix": "q", "suffix": "ite", "length": 5, "cluster": "15338", "prob": -8.254060745239258, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "10", "id": 362, "lower": "10", "norm": "10", "shape": "dd", "prefix": "1", "suffix": "10", "length": 2, "cluster": "1970", "prob": -8.258377075195312, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "true", "id": 363, "lower": "true", "norm": "true", "shape": "xxxx", "prefix": "t", "suffix": "rue", "length": 4, "cluster": "4586", "prob": -8.259368896484375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "remember", "id": 364, "lower": "remember", "norm": "remember", "shape": "xxxx", "prefix": "r", "suffix": "ber", "length": 8, "cluster": "3722", "prob": -8.259916305541992, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "definitely", "id": 365, "lower": "definitely", "norm": "definitely", "shape": "xxxx", "prefix": "d", "suffix": "ely", "length": 10, "cluster": "7802", "prob": -8.264209747314453, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "call", "id": 366, "lower": "call", "norm": "call", "shape": "xxxx", "prefix": "c", "suffix": "all", "length": 4, "cluster": "3765", "prob": -8.267317771911621, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "pay", "id": 367, "lower": "pay", "norm": "pay", "shape": "xxx", "prefix": "p", "suffix": "pay", "length": 3, "cluster": "7946", "prob": -8.26932144165039, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "stop", "id": 368, "lower": "stop", "norm": "stop", "shape": "xxxx", "prefix": "s", "suffix": "top", "length": 4, "cluster": "3338", "prob": -8.272970199584961, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "set", "id": 369, "lower": "set", "norm": "set", "shape": "xxx", "prefix": "s", "suffix": "set", "length": 3, "cluster": "2218", "prob": -8.285635948181152, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "started", "id": 370, "lower": "started", "norm": "started", "shape": "xxxx", "prefix": "s", "suffix": "ted", "length": 7, "cluster": "3242", "prob": -8.286487579345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "instead", "id": 371, "lower": "instead", "norm": "instead", "shape": "xxxx", "prefix": "i", "suffix": "ead", "length": 7, "cluster": "2005", "prob": -8.292781829833984, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "story", "id": 372, "lower": "story", "norm": "story", "shape": "xxxx", "prefix": "s", "suffix": "ory", "length": 5, "cluster": "6853", "prob": -8.293317794799805, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "level", "id": 373, "lower": "level", "norm": "level", "shape": "xxxx", "prefix": "l", "suffix": "vel", "length": 5, "cluster": "6117", "prob": -8.29642391204834, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "left", "id": 374, "lower": "left", "norm": "left", "shape": "xxxx", "prefix": "l", "suffix": "eft", "length": 4, "cluster": "54954", "prob": -8.296669006347656, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "week", "id": 375, "lower": "week", "norm": "week", "shape": "xxxx", "prefix": "w", "suffix": "eek", "length": 4, "cluster": "157", "prob": -8.300933837890625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "system", "id": 376, "lower": "system", "norm": "system", "shape": "xxxx", "prefix": "s", "suffix": "tem", "length": 6, "cluster": "4901", "prob": -8.303738594055176, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "full", "id": 377, "lower": "full", "norm": "full", "shape": "xxxx", "prefix": "f", "suffix": "ull", "length": 4, "cluster": "4071", "prob": -8.303950309753418, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "rather", "id": 378, "lower": "rather", "norm": "rather", "shape": "xxxx", "prefix": "r", "suffix": "her", "length": 6, "cluster": "6698", "prob": -8.312031745910645, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "video", "id": 379, "lower": "video", "norm": "video", "shape": "xxxx", "prefix": "v", "suffix": "deo", "length": 5, "cluster": "1975", "prob": -8.316000938415527, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "home", "id": 380, "lower": "home", "norm": "home", "shape": "xxxx", "prefix": "h", "suffix": "ome", "length": 4, "cluster": "1013", "prob": -8.316133499145508, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "women", "id": 381, "lower": "women", "norm": "women", "shape": "xxxx", "prefix": "w", "suffix": "men", "length": 5, "cluster": "877", "prob": -8.317564964294434, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "usually", "id": 382, "lower": "usually", "norm": "usually", "shape": "xxxx", "prefix": "u", "suffix": "lly", "length": 7, "cluster": "3706", "prob": -8.324220657348633, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "side", "id": 383, "lower": "side", "norm": "side", "shape": "xxxx", "prefix": "s", "suffix": "ide", "length": 4, "cluster": "8037", "prob": -8.327798843383789, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "wanted", "id": 384, "lower": "wanted", "norm": "wanted", "shape": "xxxx", "prefix": "w", "suffix": "ted", "length": 6, "cluster": "30634", "prob": -8.329934120178223, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "sense", "id": 385, "lower": "sense", "norm": "sense", "shape": "xxxx", "prefix": "s", "suffix": "nse", "length": 5, "cluster": "613", "prob": -8.338400840759277, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Your", "id": 386, "lower": "your", "norm": "Your", "shape": "Xxxx", "prefix": "Y", "suffix": "our", "length": 4, "cluster": "94", "prob": -8.347208023071289, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "second", "id": 387, "lower": "second", "norm": "second", "shape": "xxxx", "prefix": "s", "suffix": "ond", "length": 6, "cluster": "31", "prob": -8.351142883300781, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "comment", "id": 388, "lower": "comment", "norm": "comment", "shape": "xxxx", "prefix": "c", "suffix": "ent", "length": 7, "cluster": "757", "prob": -8.35578727722168, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "course", "id": 389, "lower": "course", "norm": "course", "shape": "xxxx", "prefix": "c", "suffix": "rse", "length": 6, "cluster": "1009", "prob": -8.35777759552002, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "ask", "id": 390, "lower": "ask", "norm": "ask", "shape": "xxx", "prefix": "a", "suffix": "ask", "length": 3, "cluster": "1546", "prob": -8.35922622680664, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Or", "id": 391, "lower": "or", "norm": "Or", "shape": "Xx", "prefix": "O", "suffix": "Or", "length": 2, "cluster": "1726", "prob": -8.361105918884277, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "seem", "id": 392, "lower": "seem", "norm": "seem", "shape": "xxxx", "prefix": "s", "suffix": "eem", "length": 4, "cluster": "906", "prob": -8.363061904907227, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Maybe", "id": 393, "lower": "maybe", "norm": "Maybe", "shape": "Xxxxx", "prefix": "M", "suffix": "ybe", "length": 5, "cluster": "190", "prob": -8.364654541015625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "must", "id": 394, "lower": "must", "norm": "must", "shape": "xxxx", "prefix": "m", "suffix": "ust", "length": 4, "cluster": "698", "prob": -8.365957260131836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Then", "id": 395, "lower": "then", "norm": "Then", "shape": "Xxxx", "prefix": "T", "suffix": "hen", "length": 4, "cluster": "1726", "prob": -8.369159698486328, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "small", "id": 396, "lower": "small", "norm": "small", "shape": "xxxx", "prefix": "s", "suffix": "all", "length": 5, "cluster": "391", "prob": -8.371565818786621, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "car", "id": 397, "lower": "car", "norm": "car", "shape": "xxx", "prefix": "c", "suffix": "car", "length": 3, "cluster": "1145", "prob": -8.374984741210938, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "hate", "id": 398, "lower": "hate", "norm": "hate", "shape": "xxxx", "prefix": "h", "suffix": "ate", "length": 4, "cluster": "906", "prob": -8.380099296569824, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "came", "id": 399, "lower": "came", "norm": "came", "shape": "xxxx", "prefix": "c", "suffix": "ame", "length": 4, "cluster": "15530", "prob": -8.382718086242676, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "watch", "id": 400, "lower": "watch", "norm": "watch", "shape": "xxxx", "prefix": "w", "suffix": "tch", "length": 5, "cluster": "3765", "prob": -8.386272430419922, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "experience", "id": 401, "lower": "experience", "norm": "experience", "shape": "xxxx", "prefix": "e", "suffix": "nce", "length": 10, "cluster": "2917", "prob": -8.387101173400879, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "cool", "id": 402, "lower": "cool", "norm": "cool", "shape": "xxxx", "prefix": "c", "suffix": "ool", "length": 4, "cluster": "565", "prob": -8.393746376037598, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "matter", "id": 403, "lower": "matter", "norm": "matter", "shape": "xxxx", "prefix": "m", "suffix": "ter", "length": 6, "cluster": "4805", "prob": -8.395515441894531, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "others", "id": 404, "lower": "others", "norm": "others", "shape": "xxxx", "prefix": "o", "suffix": "ers", "length": 6, "cluster": "1901", "prob": -8.396527290344238, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "completely", "id": 405, "lower": "completely", "norm": "completely", "shape": "xxxx", "prefix": "c", "suffix": "ely", "length": 10, "cluster": "12010", "prob": -8.40324592590332, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "All", "id": 406, "lower": "all", "norm": "All", "shape": "Xxx", "prefix": "A", "suffix": "All", "length": 3, "cluster": "1214", "prob": -8.403707504272461, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "called", "id": 407, "lower": "called", "norm": "called", "shape": "xxxx", "prefix": "c", "suffix": "led", "length": 6, "cluster": "11946", "prob": -8.404229164123535, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "under", "id": 408, "lower": "under", "norm": "under", "shape": "xxxx", "prefix": "u", "suffix": "der", "length": 5, "cluster": "32764", "prob": -8.406200408935547, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "yes", "id": 409, "lower": "yes", "norm": "yes", "shape": "xxx", "prefix": "y", "suffix": "yes", "length": 3, "cluster": "15146", "prob": -8.41097354888916, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Now", "id": 410, "lower": "now", "norm": "Now", "shape": "Xxx", "prefix": "N", "suffix": "Now", "length": 3, "cluster": "1726", "prob": -8.417712211608887, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Please", "id": 411, "lower": "please", "norm": "Please", "shape": "Xxxxx", "prefix": "P", "suffix": "ase", "length": 6, "cluster": "3582", "prob": -8.41897964477539, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "worth", "id": 412, "lower": "worth", "norm": "worth", "shape": "xxxx", "prefix": "w", "suffix": "rth", "length": 5, "cluster": "981", "prob": -8.423324584960938, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "says", "id": 413, "lower": "says", "norm": "says", "shape": "xxxx", "prefix": "s", "suffix": "ays", "length": 4, "cluster": "244", "prob": -8.426565170288086, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "comes", "id": 414, "lower": "comes", "norm": "comes", "shape": "xxxx", "prefix": "c", "suffix": "mes", "length": 5, "cluster": "15530", "prob": -8.428640365600586, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "fine", "id": 415, "lower": "fine", "norm": "fine", "shape": "xxxx", "prefix": "f", "suffix": "ine", "length": 4, "cluster": "8057", "prob": -8.428781509399414, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Thank", "id": 416, "lower": "thank", "norm": "Thank", "shape": "Xxxxx", "prefix": "T", "suffix": "ank", "length": 5, "cluster": "190", "prob": -8.434432983398438, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": " \n", "id": 0, "lower": " \n", "norm": " \n", "shape": " \n", "prefix": " ", "suffix": " \n", "length": 2, "cluster": "0", "prob": -8.435208320617676, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "works", "id": 417, "lower": "works", "norm": "works", "shape": "xxxx", "prefix": "w", "suffix": "rks", "length": 5, "cluster": "77", "prob": -8.436944961547852, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "exactly", "id": 418, "lower": "exactly", "norm": "exactly", "shape": "xxxx", "prefix": "e", "suffix": "tly", "length": 7, "cluster": "15338", "prob": -8.43747615814209, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "heard", "id": 419, "lower": "heard", "norm": "heard", "shape": "xxxx", "prefix": "h", "suffix": "ard", "length": 5, "cluster": "26282", "prob": -8.4396333694458, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "possible", "id": 420, "lower": "possible", "norm": "possible", "shape": "xxxx", "prefix": "p", "suffix": "ble", "length": 8, "cluster": "2535", "prob": -8.44277572631836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "thinking", "id": 421, "lower": "thinking", "norm": "thinking", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 8, "cluster": "4586", "prob": -8.442947387695312, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "hours", "id": 422, "lower": "hours", "norm": "hours", "shape": "xxxx", "prefix": "h", "suffix": "urs", "length": 5, "cluster": "957", "prob": -8.445417404174805, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "working", "id": 423, "lower": "working", "norm": "working", "shape": "xxxx", "prefix": "w", "suffix": "ing", "length": 7, "cluster": "27626", "prob": -8.44786262512207, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "took", "id": 424, "lower": "took", "norm": "took", "shape": "xxxx", "prefix": "t", "suffix": "ook", "length": 4, "cluster": "27050", "prob": -8.452874183654785, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "thanks", "id": 425, "lower": "thanks", "norm": "thanks", "shape": "xxxx", "prefix": "t", "suffix": "nks", "length": 6, "cluster": "554", "prob": -8.457283973693848, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "head", "id": 426, "lower": "head", "norm": "head", "shape": "xxxx", "prefix": "h", "suffix": "ead", "length": 4, "cluster": "1813", "prob": -8.458500862121582, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "power", "id": 427, "lower": "power", "norm": "power", "shape": "xxxx", "prefix": "p", "suffix": "wer", "length": 5, "cluster": "11621", "prob": -8.460216522216797, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "happen", "id": 428, "lower": "happen", "norm": "happen", "shape": "xxxx", "prefix": "h", "suffix": "pen", "length": 6, "cluster": "3466", "prob": -8.465093612670898, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "goes", "id": 429, "lower": "goes", "norm": "goes", "shape": "xxxx", "prefix": "g", "suffix": "oes", "length": 4, "cluster": "7338", "prob": -8.465673446655273, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Good", "id": 430, "lower": "good", "norm": "Good", "shape": "Xxxx", "prefix": "G", "suffix": "ood", "length": 4, "cluster": "614", "prob": -8.468016624450684, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "saw", "id": 431, "lower": "saw", "norm": "saw", "shape": "xxx", "prefix": "s", "suffix": "saw", "length": 3, "cluster": "6570", "prob": -8.472514152526855, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "please", "id": 432, "lower": "please", "norm": "please", "shape": "xxxx", "prefix": "p", "suffix": "ase", "length": 6, "cluster": "309", "prob": -8.473013877868652, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "couple", "id": 433, "lower": "couple", "norm": "couple", "shape": "xxxx", "prefix": "c", "suffix": "ple", "length": 6, "cluster": "853", "prob": -8.47309398651123, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "hit", "id": 434, "lower": "hit", "norm": "hit", "shape": "xxx", "prefix": "h", "suffix": "hit", "length": 3, "cluster": "682", "prob": -8.473491668701172, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "likely", "id": 435, "lower": "likely", "norm": "likely", "shape": "xxxx", "prefix": "l", "suffix": "ely", "length": 6, "cluster": "42", "prob": -8.47359561920166, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "ones", "id": 436, "lower": "ones", "norm": "ones", "shape": "xxxx", "prefix": "o", "suffix": "nes", "length": 4, "cluster": "15821", "prob": -8.474469184875488, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "often", "id": 437, "lower": "often", "norm": "often", "shape": "xxxx", "prefix": "o", "suffix": "ten", "length": 5, "cluster": "3706", "prob": -8.476237297058105, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "talk", "id": 438, "lower": "talk", "norm": "talk", "shape": "xxxx", "prefix": "t", "suffix": "alk", "length": 4, "cluster": "394", "prob": -8.479889869689941, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "issue", "id": 439, "lower": "issue", "norm": "issue", "shape": "xxxx", "prefix": "i", "suffix": "sue", "length": 5, "cluster": "3525", "prob": -8.48391342163086, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "easy", "id": 440, "lower": "easy", "norm": "easy", "shape": "xxxx", "prefix": "e", "suffix": "asy", "length": 4, "cluster": "2538", "prob": -8.489182472229004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "One", "id": 441, "lower": "one", "norm": "One", "shape": "Xxx", "prefix": "O", "suffix": "One", "length": 3, "cluster": "350", "prob": -8.494391441345215, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "needs", "id": 442, "lower": "needs", "norm": "needs", "shape": "xxxx", "prefix": "n", "suffix": "eds", "length": 5, "cluster": "14250", "prob": -8.49528694152832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "add", "id": 443, "lower": "add", "norm": "add", "shape": "xxx", "prefix": "a", "suffix": "add", "length": 3, "cluster": "3594", "prob": -8.496837615966797, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "support", "id": 444, "lower": "support", "norm": "support", "shape": "xxxx", "prefix": "s", "suffix": "ort", "length": 7, "cluster": "7861", "prob": -8.503355026245117, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "face", "id": 445, "lower": "face", "norm": "face", "shape": "xxxx", "prefix": "f", "suffix": "ace", "length": 4, "cluster": "1685", "prob": -8.504852294921875, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "hand", "id": 446, "lower": "hand", "norm": "hand", "shape": "xxxx", "prefix": "h", "suffix": "and", "length": 4, "cluster": "8037", "prob": -8.504961967468262, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "half", "id": 447, "lower": "half", "norm": "half", "shape": "xxxx", "prefix": "h", "suffix": "alf", "length": 4, "cluster": "469", "prob": -8.508658409118652, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "check", "id": 448, "lower": "check", "norm": "check", "shape": "xxxx", "prefix": "c", "suffix": "eck", "length": 5, "cluster": "2485", "prob": -8.512067794799805, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "night", "id": 449, "lower": "night", "norm": "night", "shape": "xxxx", "prefix": "n", "suffix": "ght", "length": 5, "cluster": "93", "prob": -8.517072677612305, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "months", "id": 450, "lower": "months", "norm": "months", "shape": "xxxx", "prefix": "m", "suffix": "ths", "length": 6, "cluster": "445", "prob": -8.517988204956055, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "kids", "id": 451, "lower": "kids", "norm": "kids", "shape": "xxxx", "prefix": "k", "suffix": "ids", "length": 4, "cluster": "877", "prob": -8.520237922668457, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "players", "id": 452, "lower": "players", "norm": "players", "shape": "xxxx", "prefix": "p", "suffix": "ers", "length": 7, "cluster": "3565", "prob": -8.520515441894531, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "line", "id": 453, "lower": "line", "norm": "line", "shape": "xxxx", "prefix": "l", "suffix": "ine", "length": 4, "cluster": "3941", "prob": -8.522600173950195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "told", "id": 454, "lower": "told", "norm": "told", "shape": "xxxx", "prefix": "t", "suffix": "old", "length": 4, "cluster": "20138", "prob": -8.52303409576416, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "example", "id": 455, "lower": "example", "norm": "example", "shape": "xxxx", "prefix": "e", "suffix": "ple", "length": 7, "cluster": "497", "prob": -8.523116111755371, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "played", "id": 456, "lower": "played", "norm": "played", "shape": "xxxx", "prefix": "p", "suffix": "yed", "length": 6, "cluster": "32426", "prob": -8.528886795043945, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "reddit", "id": 457, "lower": "reddit", "norm": "reddit", "shape": "xxxx", "prefix": "r", "suffix": "dit", "length": 6, "cluster": "0", "prob": -8.52908992767334, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "based", "id": 458, "lower": "based", "norm": "based", "shape": "xxxx", "prefix": "b", "suffix": "sed", "length": 5, "cluster": "1578", "prob": -8.53032112121582, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "tried", "id": 459, "lower": "tried", "norm": "tried", "shape": "xxxx", "prefix": "t", "suffix": "ied", "length": 5, "cluster": "28586", "prob": -8.532145500183105, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "sounds", "id": 460, "lower": "sounds", "norm": "sounds", "shape": "xxxx", "prefix": "s", "suffix": "nds", "length": 6, "cluster": "2442", "prob": -8.53985595703125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "link", "id": 461, "lower": "link", "norm": "link", "shape": "xxxx", "prefix": "l", "suffix": "ink", "length": 4, "cluster": "5829", "prob": -8.540618896484375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "girl", "id": 462, "lower": "girl", "norm": "girl", "shape": "xxxx", "prefix": "g", "suffix": "irl", "length": 4, "cluster": "549", "prob": -8.542597770690918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "open", "id": 463, "lower": "open", "norm": "open", "shape": "xxxx", "prefix": "o", "suffix": "pen", "length": 4, "cluster": "1589", "prob": -8.553583145141602, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "To", "id": 464, "lower": "to", "norm": "To", "shape": "Xx", "prefix": "T", "suffix": "To", "length": 2, "cluster": "3582", "prob": -8.557126998901367, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "taking", "id": 465, "lower": "taking", "norm": "taking", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 6, "cluster": "31722", "prob": -8.55748462677002, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "happened", "id": 466, "lower": "happened", "norm": "happened", "shape": "xxxx", "prefix": "h", "suffix": "ned", "length": 8, "cluster": "5290", "prob": -8.559469223022461, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "during", "id": 467, "lower": "during", "norm": "during", "shape": "xxxx", "prefix": "d", "suffix": "ing", "length": 6, "cluster": "262140", "prob": -8.559581756591797, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "deal", "id": 468, "lower": "deal", "norm": "deal", "shape": "xxxx", "prefix": "d", "suffix": "eal", "length": 4, "cluster": "5829", "prob": -8.560197830200195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "single", "id": 469, "lower": "single", "norm": "single", "shape": "xxxx", "prefix": "s", "suffix": "gle", "length": 6, "cluster": "71", "prob": -8.571329116821289, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "family", "id": 470, "lower": "family", "norm": "family", "shape": "xxxx", "prefix": "f", "suffix": "ily", "length": 6, "cluster": "1061", "prob": -8.571907043457031, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "close", "id": 471, "lower": "close", "norm": "close", "shape": "xxxx", "prefix": "c", "suffix": "ose", "length": 5, "cluster": "53", "prob": -8.581155776977539, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "happy", "id": 472, "lower": "happy", "norm": "happy", "shape": "xxxx", "prefix": "h", "suffix": "ppy", "length": 5, "cluster": "4586", "prob": -8.581560134887695, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "move", "id": 473, "lower": "move", "norm": "move", "shape": "xxxx", "prefix": "m", "suffix": "ove", "length": 4, "cluster": "7093", "prob": -8.582797050476074, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "number", "id": 474, "lower": "number", "norm": "number", "shape": "xxxx", "prefix": "n", "suffix": "ber", "length": 6, "cluster": "341", "prob": -8.584420204162598, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "US", "id": 475, "lower": "us", "norm": "US", "shape": "XX", "prefix": "U", "suffix": "US", "length": 2, "cluster": "1642", "prob": -8.585862159729004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "water", "id": 476, "lower": "water", "norm": "water", "shape": "xxxx", "prefix": "w", "suffix": "ter", "length": 5, "cluster": "3705", "prob": -8.589462280273438, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "men", "id": 477, "lower": "men", "norm": "men", "shape": "xxx", "prefix": "m", "suffix": "men", "length": 3, "cluster": "877", "prob": -8.59007453918457, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "yeah", "id": 478, "lower": "yeah", "norm": "yeah", "shape": "xxxx", "prefix": "y", "suffix": "eah", "length": 4, "cluster": "26", "prob": -8.593489646911621, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "later", "id": 479, "lower": "later", "norm": "later", "shape": "xxxx", "prefix": "l", "suffix": "ter", "length": 5, "cluster": "5866", "prob": -8.603795051574707, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "whatever", "id": 480, "lower": "whatever", "norm": "whatever", "shape": "xxxx", "prefix": "w", "suffix": "ver", "length": 8, "cluster": "2026", "prob": -8.610091209411621, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "government", "id": 481, "lower": "government", "norm": "government", "shape": "xxxx", "prefix": "g", "suffix": "ent", "length": 10, "cluster": "297", "prob": -8.610445022583008, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "6", "id": 482, "lower": "6", "norm": "6", "shape": "d", "prefix": "6", "suffix": "6", "length": 1, "cluster": "50", "prob": -8.611133575439453, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "house", "id": 483, "lower": "house", "norm": "house", "shape": "xxxx", "prefix": "h", "suffix": "use", "length": 5, "cluster": "37", "prob": -8.613367080688477, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "similar", "id": 484, "lower": "similar", "norm": "similar", "shape": "xxxx", "prefix": "s", "suffix": "lar", "length": 7, "cluster": "295", "prob": -8.613471031188965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "wait", "id": 485, "lower": "wait", "norm": "wait", "shape": "xxxx", "prefix": "w", "suffix": "ait", "length": 4, "cluster": "3765", "prob": -8.613734245300293, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "questions", "id": 486, "lower": "questions", "norm": "questions", "shape": "xxxx", "prefix": "q", "suffix": "ons", "length": 9, "cluster": "1165", "prob": -8.613752365112305, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "sex", "id": 487, "lower": "sex", "norm": "sex", "shape": "xxx", "prefix": "s", "suffix": "sex", "length": 3, "cluster": "633", "prob": -8.613862991333008, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "especially", "id": 488, "lower": "especially", "norm": "especially", "shape": "xxxx", "prefix": "e", "suffix": "lly", "length": 10, "cluster": "27882", "prob": -8.616527557373047, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "lol", "id": 489, "lower": "lol", "norm": "lol", "shape": "xxx", "prefix": "l", "suffix": "lol", "length": 3, "cluster": "0", "prob": -8.621257781982422, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "Because", "id": 490, "lower": "because", "norm": "Because", "shape": "Xxxxx", "prefix": "B", "suffix": "use", "length": 7, "cluster": "1214", "prob": -8.623008728027344, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "God", "id": 491, "lower": "god", "norm": "God", "shape": "Xxx", "prefix": "G", "suffix": "God", "length": 3, "cluster": "422", "prob": -8.62376594543457, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
diff --git a/website/api/_annotation/_training.jade b/website/api/_annotation/_training.jade
index d05bfa825..9a5e96628 100644
--- a/website/api/_annotation/_training.jade
+++ b/website/api/_annotation/_training.jade
@@ -1,5 +1,7 @@
 //- 💫 DOCS > API > ANNOTATION > TRAINING
 
++h(3, "json-input") JSON input format for training
+
 p
     |  spaCy takes training data in JSON format. The built-in
     |  #[+api("cli#convert") #[code convert]] command helps you convert the
@@ -46,3 +48,57 @@ p
     |  Treebank:
 
 +github("spacy", "examples/training/training-data.json", false, false, "json")
+
++h(3, "vocab-jsonl") Lexical data for vocabulary
+    +tag-new(2)
+
+p
+    |  To populate a model's vocabulary, you can use the
+    |  #[+api("cli#vocab") #[code spacy vocab]] command and load in a
+    |  #[+a("https://jsonlines.readthedocs.io/en/latest/") newline-delimited JSON]
+    |  (JSONL) file containing one lexical entry per line. The first line
+    |  defines the language and vocabulary settings. All other lines are
+    |  expected to be JSON objects describing an individual lexeme. The lexical
+    |  attributes will then be set as attributes on spaCy's
+    |  #[+api("lexeme#attributes") #[code Lexeme]] object. The #[code vocab]
+    |  command outputs a ready-to-use spaCy model with a #[code Vocab]
+    |  containing the lexical data.
+
++code("First line").
+    {"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
+
++code("Entry structure").
+    {
+        "orth": string,
+        "id": int,
+        "lower": string,
+        "norm": string,
+        "shape": string,
+        "prefix": string,
+        "suffix": string,
+        "length": int,
+        "cluster": string,
+        "prob": float,
+        "is_alpha": bool,
+        "is_ascii": bool,
+        "is_digit": bool,
+        "is_lower": bool,
+        "is_punct": bool,
+        "is_space": bool,
+        "is_title": bool,
+        "is_upper": bool,
+        "like_url": bool,
+        "like_num": bool,
+        "like_email": bool,
+        "is_stop": bool,
+        "is_oov": bool,
+        "is_quote": bool,
+        "is_left_punct": bool,
+        "is_right_punct": bool
+    }
+
+p
+    |  Here's an example of the 500 most frequent lexemes in the English
+    |  training data:
+
++github("spacy", "examples/training/vocab-data.json", false, false, "json")
diff --git a/website/api/_data.json b/website/api/_data.json
index 0be09b782..886404c99 100644
--- a/website/api/_data.json
+++ b/website/api/_data.json
@@ -220,7 +220,7 @@
             "Lemmatization": "lemmatization",
             "Dependencies": "dependency-parsing",
             "Named Entities": "named-entities",
-            "Training Data": "training"
+            "Models & Training": "training"
         }
     }
 }
diff --git a/website/api/annotation.jade b/website/api/annotation.jade
index c65cd3983..16598371d 100644
--- a/website/api/annotation.jade
+++ b/website/api/annotation.jade
@@ -99,6 +99,6 @@ p This document describes the target annotations spaCy is trained to predict.
     include _annotation/_biluo
 
 +section("training")
-    +h(2, "json-input") JSON input format for training
+    +h(2, "training") Models and training data
 
     include _annotation/_training

From f02b0af821ab7f82dbc4cf42e4f2ed0d273d230a Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 19:44:35 +0100
Subject: [PATCH 33/90] Fix path and use smaller example size

500 was too large and caused laggy rendering
---
 examples/training/vocab-data.jsonl     | 399 -------------------------
 website/api/_annotation/_training.jade |   4 +-
 2 files changed, 2 insertions(+), 401 deletions(-)

diff --git a/examples/training/vocab-data.jsonl b/examples/training/vocab-data.jsonl
index 4fae8fd65..3fdf5eede 100644
--- a/examples/training/vocab-data.jsonl
+++ b/examples/training/vocab-data.jsonl
@@ -99,402 +99,3 @@
 {"orth": ";", "id": 95, "lower": ";", "norm": ";", "shape": ";", "prefix": ";", "suffix": ";", "length": 1, "cluster": "36", "prob": -6.586422920227051, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
 {"orth": "'ve", "id": 96, "lower": "'ve", "norm": "'ve", "shape": "'xx", "prefix": "'", "suffix": "'ve", "length": 3, "cluster": "1018", "prob": -6.593011379241943, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
 {"orth": "could", "id": 97, "lower": "could", "norm": "could", "shape": "xxxx", "prefix": "c", "suffix": "uld", "length": 5, "cluster": "954", "prob": -6.595959186553955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "then", "id": 98, "lower": "then", "norm": "then", "shape": "xxxx", "prefix": "t", "suffix": "hen", "length": 4, "cluster": "9962", "prob": -6.598200798034668, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "other", "id": 99, "lower": "other", "norm": "other", "shape": "xxxx", "prefix": "o", "suffix": "her", "length": 5, "cluster": "47", "prob": -6.6438727378845215, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "make", "id": 100, "lower": "make", "norm": "make", "shape": "xxxx", "prefix": "m", "suffix": "ake", "length": 4, "cluster": "4618", "prob": -6.66980504989624, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "been", "id": 101, "lower": "been", "norm": "been", "shape": "xxxx", "prefix": "b", "suffix": "een", "length": 4, "cluster": "202", "prob": -6.670916557312012, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "were", "id": 102, "lower": "were", "norm": "were", "shape": "xxxx", "prefix": "w", "suffix": "ere", "length": 4, "cluster": "506", "prob": -6.673174858093262, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "see", "id": 103, "lower": "see", "norm": "see", "shape": "xxx", "prefix": "s", "suffix": "see", "length": 3, "cluster": "1546", "prob": -6.6828837394714355, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "That", "id": 104, "lower": "that", "norm": "That", "shape": "Xxxx", "prefix": "T", "suffix": "hat", "length": 4, "cluster": "1406", "prob": -6.688080310821533, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "i", "id": 105, "lower": "i", "norm": "i", "shape": "x", "prefix": "i", "suffix": "i", "length": 1, "cluster": "966", "prob": -6.6887712478637695, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "any", "id": 106, "lower": "any", "norm": "any", "shape": "xxx", "prefix": "a", "suffix": "any", "length": 3, "cluster": "12266", "prob": -6.689523220062256, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "want", "id": 107, "lower": "want", "norm": "want", "shape": "xxxx", "prefix": "w", "suffix": "ant", "length": 4, "cluster": "906", "prob": -6.694204807281494, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "even", "id": 108, "lower": "even", "norm": "even", "shape": "xxxx", "prefix": "e", "suffix": "ven", "length": 4, "cluster": "3306", "prob": -6.702912330627441, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "should", "id": 109, "lower": "should", "norm": "should", "shape": "xxxx", "prefix": "s", "suffix": "uld", "length": 6, "cluster": "698", "prob": -6.733259677886963, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "way", "id": 110, "lower": "way", "norm": "way", "shape": "xxx", "prefix": "w", "suffix": "way", "length": 3, "cluster": "1349", "prob": -6.73627233505249, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "'", "id": 111, "lower": "'", "norm": "'", "shape": "'", "prefix": "'", "suffix": "'", "length": 1, "cluster": "916", "prob": -6.73720121383667, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
-{"orth": "too", "id": 112, "lower": "too", "norm": "too", "shape": "xxx", "prefix": "t", "suffix": "too", "length": 3, "cluster": "6378", "prob": -6.77581787109375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "go", "id": 113, "lower": "go", "norm": "go", "shape": "xx", "prefix": "g", "suffix": "go", "length": 2, "cluster": "3466", "prob": -6.775965213775635, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "him", "id": 114, "lower": "him", "norm": "him", "shape": "xxx", "prefix": "h", "suffix": "him", "length": 3, "cluster": "1898", "prob": -6.783067226409912, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "This", "id": 115, "lower": "this", "norm": "This", "shape": "Xxxx", "prefix": "T", "suffix": "his", "length": 4, "cluster": "382", "prob": -6.78391695022583, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "her", "id": 116, "lower": "her", "norm": "her", "shape": "xxx", "prefix": "h", "suffix": "her", "length": 3, "cluster": "507", "prob": -6.798486709594727, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "going", "id": 117, "lower": "going", "norm": "going", "shape": "xxxx", "prefix": "g", "suffix": "ing", "length": 5, "cluster": "2090", "prob": -6.833367824554443, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "now", "id": 118, "lower": "now", "norm": "now", "shape": "xxx", "prefix": "n", "suffix": "now", "length": 3, "cluster": "1770", "prob": -6.834407329559326, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "being", "id": 119, "lower": "being", "norm": "being", "shape": "xxxx", "prefix": "b", "suffix": "ing", "length": 5, "cluster": "3818", "prob": -6.845808029174805, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "still", "id": 120, "lower": "still", "norm": "still", "shape": "xxxx", "prefix": "s", "suffix": "ill", "length": 5, "cluster": "1658", "prob": -6.867525100708008, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "into", "id": 121, "lower": "into", "norm": "into", "shape": "xxxx", "prefix": "i", "suffix": "nto", "length": 4, "cluster": "8188", "prob": -6.87359094619751, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "which", "id": 122, "lower": "which", "norm": "which", "shape": "xxxx", "prefix": "w", "suffix": "ich", "length": 5, "cluster": "154", "prob": -6.877470970153809, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "something", "id": 123, "lower": "something", "norm": "something", "shape": "xxxx", "prefix": "s", "suffix": "ing", "length": 9, "cluster": "14314", "prob": -6.887354850769043, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "she", "id": 124, "lower": "she", "norm": "she", "shape": "xxx", "prefix": "s", "suffix": "she", "length": 3, "cluster": "218", "prob": -6.90155553817749, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "also", "id": 125, "lower": "also", "norm": "also", "shape": "xxxx", "prefix": "a", "suffix": "lso", "length": 4, "cluster": "122", "prob": -6.928974151611328, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "very", "id": 126, "lower": "very", "norm": "very", "shape": "xxxx", "prefix": "v", "suffix": "ery", "length": 4, "cluster": "234", "prob": -6.93242883682251, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "right", "id": 127, "lower": "right", "norm": "right", "shape": "xxxx", "prefix": "r", "suffix": "ght", "length": 5, "cluster": "14122", "prob": -6.933711051940918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "game", "id": 128, "lower": "game", "norm": "game", "shape": "xxxx", "prefix": "g", "suffix": "ame", "length": 4, "cluster": "7973", "prob": -6.940612316131592, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "say", "id": 129, "lower": "say", "norm": "say", "shape": "xxx", "prefix": "s", "suffix": "say", "length": 3, "cluster": "1162", "prob": -6.950479984283447, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "'ll", "id": 130, "lower": "'ll", "norm": "'ll", "shape": "'xx", "prefix": "'", "suffix": "'ll", "length": 3, "cluster": "5114", "prob": -6.958071231842041, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "got", "id": 131, "lower": "got", "norm": "got", "shape": "xxx", "prefix": "g", "suffix": "got", "length": 3, "cluster": "10666", "prob": -6.98855447769165, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "well", "id": 132, "lower": "well", "norm": "well", "shape": "xxxx", "prefix": "w", "suffix": "ell", "length": 4, "cluster": "746", "prob": -6.995903968811035, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "need", "id": 133, "lower": "need", "norm": "need", "shape": "xxxx", "prefix": "n", "suffix": "eed", "length": 4, "cluster": "2954", "prob": -7.008103370666504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "And", "id": 134, "lower": "and", "norm": "And", "shape": "Xxx", "prefix": "A", "suffix": "And", "length": 3, "cluster": "1470", "prob": -7.012199401855469, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "But", "id": 135, "lower": "but", "norm": "But", "shape": "Xxx", "prefix": "B", "suffix": "But", "length": 3, "cluster": "1470", "prob": -7.0142974853515625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "over", "id": 136, "lower": "over", "norm": "over", "shape": "xxxx", "prefix": "o", "suffix": "ver", "length": 4, "cluster": "49148", "prob": -7.027544975280762, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "back", "id": 137, "lower": "back", "norm": "back", "shape": "xxxx", "prefix": "b", "suffix": "ack", "length": 4, "cluster": "7530", "prob": -7.033305644989014, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "same", "id": 138, "lower": "same", "norm": "same", "shape": "xxxx", "prefix": "s", "suffix": "ame", "length": 4, "cluster": "991", "prob": -7.053191661834717, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "thing", "id": 139, "lower": "thing", "norm": "thing", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 5, "cluster": "2013", "prob": -7.063167572021484, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "first", "id": 140, "lower": "first", "norm": "first", "shape": "xxxx", "prefix": "f", "suffix": "rst", "length": 5, "cluster": "159", "prob": -7.063716888427734, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "most", "id": 141, "lower": "most", "norm": "most", "shape": "xxxx", "prefix": "m", "suffix": "ost", "length": 4, "cluster": "175", "prob": -7.0663957595825195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "here", "id": 142, "lower": "here", "norm": "here", "shape": "xxxx", "prefix": "h", "suffix": "ere", "length": 4, "cluster": "3946", "prob": -7.0680251121521, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "ca", "id": 143, "lower": "ca", "norm": "ca", "shape": "xx", "prefix": "c", "suffix": "ca", "length": 2, "cluster": "0", "prob": -7.071251392364502, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "off", "id": 144, "lower": "off", "norm": "off", "shape": "xxx", "prefix": "o", "suffix": "off", "length": 3, "cluster": "6506", "prob": -7.073742389678955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "'d", "id": 145, "lower": "'d", "norm": "'d", "shape": "'x", "prefix": "'", "suffix": "'d", "length": 2, "cluster": "5114", "prob": -7.075286865234375, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "They", "id": 146, "lower": "they", "norm": "They", "shape": "Xxxx", "prefix": "T", "suffix": "hey", "length": 4, "cluster": "1882", "prob": -7.0789008140563965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "work", "id": 147, "lower": "work", "norm": "work", "shape": "xxxx", "prefix": "w", "suffix": "ork", "length": 4, "cluster": "1973", "prob": -7.081293106079102, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "use", "id": 148, "lower": "use", "norm": "use", "shape": "xxx", "prefix": "u", "suffix": "use", "length": 3, "cluster": "2741", "prob": -7.083596229553223, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "never", "id": 149, "lower": "never", "norm": "never", "shape": "xxxx", "prefix": "n", "suffix": "ver", "length": 5, "cluster": "15994", "prob": -7.084620475769043, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "better", "id": 150, "lower": "better", "norm": "better", "shape": "xxxx", "prefix": "b", "suffix": "ter", "length": 6, "cluster": "7658", "prob": -7.1072587966918945, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "though", "id": 151, "lower": "though", "norm": "though", "shape": "xxxx", "prefix": "t", "suffix": "ugh", "length": 6, "cluster": "2004", "prob": -7.113335132598877, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "lot", "id": 152, "lower": "lot", "norm": "lot", "shape": "xxx", "prefix": "l", "suffix": "lot", "length": 3, "cluster": "853", "prob": -7.113600254058838, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "pretty", "id": 153, "lower": "pretty", "norm": "pretty", "shape": "xxxx", "prefix": "p", "suffix": "tty", "length": 6, "cluster": "234", "prob": -7.1256103515625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "where", "id": 154, "lower": "where", "norm": "where", "shape": "xxxx", "prefix": "w", "suffix": "ere", "length": 5, "cluster": "8148", "prob": -7.146170139312744, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "am", "id": 155, "lower": "am", "norm": "am", "shape": "xx", "prefix": "a", "suffix": "am", "length": 2, "cluster": "3066", "prob": -7.149725437164307, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "things", "id": 156, "lower": "things", "norm": "things", "shape": "xxxx", "prefix": "t", "suffix": "ngs", "length": 6, "cluster": "3917", "prob": -7.154941082000732, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "sure", "id": 157, "lower": "sure", "norm": "sure", "shape": "xxxx", "prefix": "s", "suffix": "ure", "length": 4, "cluster": "490", "prob": -7.157395839691162, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "actually", "id": 158, "lower": "actually", "norm": "actually", "shape": "xxxx", "prefix": "a", "suffix": "lly", "length": 8, "cluster": "7802", "prob": -7.160778045654297, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "He", "id": 159, "lower": "he", "norm": "He", "shape": "Xx", "prefix": "H", "suffix": "He", "length": 2, "cluster": "126", "prob": -7.162238121032715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "those", "id": 160, "lower": "those", "norm": "those", "shape": "xxxx", "prefix": "t", "suffix": "ose", "length": 5, "cluster": "495", "prob": -7.169255256652832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "why", "id": 161, "lower": "why", "norm": "why", "shape": "xxx", "prefix": "w", "suffix": "why", "length": 3, "cluster": "18410", "prob": -7.178915500640869, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "So", "id": 162, "lower": "so", "norm": "So", "shape": "Xx", "prefix": "S", "suffix": "So", "length": 2, "cluster": "1726", "prob": -7.199381351470947, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "take", "id": 163, "lower": "take", "norm": "take", "shape": "xxxx", "prefix": "t", "suffix": "ake", "length": 4, "cluster": "6666", "prob": -7.209812641143799, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "down", "id": 164, "lower": "down", "norm": "down", "shape": "xxxx", "prefix": "d", "suffix": "own", "length": 4, "cluster": "2410", "prob": -7.223586082458496, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "What", "id": 165, "lower": "what", "norm": "What", "shape": "Xxxx", "prefix": "W", "suffix": "hat", "length": 4, "cluster": "702", "prob": -7.226758003234863, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "someone", "id": 166, "lower": "someone", "norm": "someone", "shape": "xxxx", "prefix": "s", "suffix": "one", "length": 7, "cluster": "30698", "prob": -7.249640464782715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "before", "id": 167, "lower": "before", "norm": "before", "shape": "xxxx", "prefix": "b", "suffix": "ore", "length": 6, "cluster": "1492", "prob": -7.253359794616699, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "said", "id": 168, "lower": "said", "norm": "said", "shape": "xxxx", "prefix": "s", "suffix": "aid", "length": 4, "cluster": "116", "prob": -7.258025169372559, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "after", "id": 169, "lower": "after", "norm": "after", "shape": "xxxx", "prefix": "a", "suffix": "ter", "length": 5, "cluster": "3540", "prob": -7.265651702880859, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "around", "id": 170, "lower": "around", "norm": "around", "shape": "xxxx", "prefix": "a", "suffix": "und", "length": 6, "cluster": "245756", "prob": -7.313362121582031, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "its", "id": 171, "lower": "its", "norm": "its", "shape": "xxx", "prefix": "i", "suffix": "its", "length": 3, "cluster": "27", "prob": -7.321457862854004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "2", "id": 172, "lower": "2", "norm": "2", "shape": "d", "prefix": "2", "suffix": "2", "length": 1, "cluster": "818", "prob": -7.324268341064453, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "feel", "id": 173, "lower": "feel", "norm": "feel", "shape": "xxxx", "prefix": "f", "suffix": "eel", "length": 4, "cluster": "1674", "prob": -7.342533588409424, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "My", "id": 174, "lower": "my", "norm": "My", "shape": "Xx", "prefix": "M", "suffix": "My", "length": 2, "cluster": "94", "prob": -7.345071792602539, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "There", "id": 175, "lower": "there", "norm": "There", "shape": "Xxxxx", "prefix": "T", "suffix": "ere", "length": 5, "cluster": "1918", "prob": -7.347356796264648, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "look", "id": 176, "lower": "look", "norm": "look", "shape": "xxxx", "prefix": "l", "suffix": "ook", "length": 4, "cluster": "2442", "prob": -7.352481365203857, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "these", "id": 177, "lower": "these", "norm": "these", "shape": "xxxx", "prefix": "t", "suffix": "ese", "length": 5, "cluster": "1519", "prob": -7.36269474029541, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "years", "id": 178, "lower": "years", "norm": "years", "shape": "xxxx", "prefix": "y", "suffix": "ars", "length": 5, "cluster": "189", "prob": -7.368987560272217, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "love", "id": 179, "lower": "love", "norm": "love", "shape": "xxxx", "prefix": "l", "suffix": "ove", "length": 4, "cluster": "2661", "prob": -7.372685432434082, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "always", "id": 180, "lower": "always", "norm": "always", "shape": "xxxx", "prefix": "a", "suffix": "ays", "length": 6, "cluster": "15994", "prob": -7.37296724319458, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "many", "id": 181, "lower": "many", "norm": "many", "shape": "xxxx", "prefix": "m", "suffix": "any", "length": 4, "cluster": "751", "prob": -7.377613067626953, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "&gt", "id": 0, "lower": "&gt", "norm": "&gt", "shape": "&xx", "prefix": "&", "suffix": "&gt", "length": 3, "cluster": "0", "prob": -7.38146448135376, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "A", "id": 182, "lower": "a", "norm": "A", "shape": "X", "prefix": "A", "suffix": "A", "length": 1, "cluster": "222", "prob": -7.38541841506958, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "point", "id": 183, "lower": "point", "norm": "point", "shape": "xxxx", "prefix": "p", "suffix": "int", "length": 5, "cluster": "389", "prob": -7.386973857879639, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "find", "id": 184, "lower": "find", "norm": "find", "shape": "xxxx", "prefix": "f", "suffix": "ind", "length": 4, "cluster": "5642", "prob": -7.387212753295898, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "probably", "id": 185, "lower": "probably", "norm": "probably", "shape": "xxxx", "prefix": "p", "suffix": "bly", "length": 8, "cluster": "5754", "prob": -7.395048141479492, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "new", "id": 186, "lower": "new", "norm": "new", "shape": "xxx", "prefix": "n", "suffix": "new", "length": 3, "cluster": "199", "prob": -7.398182392120361, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "made", "id": 187, "lower": "made", "norm": "made", "shape": "xxxx", "prefix": "m", "suffix": "ade", "length": 4, "cluster": "120490", "prob": -7.399899005889893, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "day", "id": 188, "lower": "day", "norm": "day", "shape": "xxx", "prefix": "d", "suffix": "day", "length": 3, "cluster": "989", "prob": -7.400947093963623, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "We", "id": 189, "lower": "we", "norm": "We", "shape": "Xx", "prefix": "W", "suffix": "We", "length": 2, "cluster": "858", "prob": -7.402578353881836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "every", "id": 190, "lower": "every", "norm": "every", "shape": "xxxx", "prefix": "e", "suffix": "ery", "length": 5, "cluster": "61418", "prob": -7.414647579193115, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "great", "id": 191, "lower": "great", "norm": "great", "shape": "xxxx", "prefix": "g", "suffix": "eat", "length": 5, "cluster": "1831", "prob": -7.420454502105713, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "our", "id": 192, "lower": "our", "norm": "our", "shape": "xxx", "prefix": "o", "suffix": "our", "length": 3, "cluster": "59", "prob": -7.4210286140441895, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "two", "id": 193, "lower": "two", "norm": "two", "shape": "xxx", "prefix": "t", "suffix": "two", "length": 3, "cluster": "15", "prob": -7.433600425720215, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "anything", "id": 194, "lower": "anything", "norm": "anything", "shape": "xxxx", "prefix": "a", "suffix": "ing", "length": 8, "cluster": "14314", "prob": -7.439383506774902, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "while", "id": 195, "lower": "while", "norm": "while", "shape": "xxxx", "prefix": "w", "suffix": "ile", "length": 5, "cluster": "6100", "prob": -7.440170764923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "few", "id": 196, "lower": "few", "norm": "few", "shape": "xxx", "prefix": "f", "suffix": "few", "length": 3, "cluster": "79", "prob": -7.440912246704102, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "$", "id": 197, "lower": "$", "norm": "$", "shape": "$", "prefix": "$", "suffix": "$", "length": 1, "cluster": "18", "prob": -7.450106620788574, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "bad", "id": 198, "lower": "bad", "norm": "bad", "shape": "xxx", "prefix": "b", "suffix": "bad", "length": 3, "cluster": "551", "prob": -7.452563762664795, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "No", "id": 199, "lower": "no", "norm": "No", "shape": "Xx", "prefix": "N", "suffix": "No", "length": 2, "cluster": "94", "prob": -7.456389427185059, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "little", "id": 200, "lower": "little", "norm": "little", "shape": "xxxx", "prefix": "l", "suffix": "tle", "length": 6, "cluster": "1959", "prob": -7.480203628540039, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "might", "id": 201, "lower": "might", "norm": "might", "shape": "xxxx", "prefix": "m", "suffix": "ght", "length": 5, "cluster": "186", "prob": -7.490107536315918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "best", "id": 202, "lower": "best", "norm": "best", "shape": "xxxx", "prefix": "b", "suffix": "est", "length": 4, "cluster": "479", "prob": -7.492556571960449, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "play", "id": 203, "lower": "play", "norm": "play", "shape": "xxxx", "prefix": "p", "suffix": "lay", "length": 4, "cluster": "1717", "prob": -7.50220251083374, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "shit", "id": 204, "lower": "shit", "norm": "shit", "shape": "xxxx", "prefix": "s", "suffix": "hit", "length": 4, "cluster": "0", "prob": -7.522359371185303, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "try", "id": 205, "lower": "try", "norm": "try", "shape": "xxx", "prefix": "t", "suffix": "try", "length": 3, "cluster": "1930", "prob": -7.540920734405518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "used", "id": 206, "lower": "used", "norm": "used", "shape": "xxxx", "prefix": "u", "suffix": "sed", "length": 4, "cluster": "15402", "prob": -7.542972087860107, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "long", "id": 207, "lower": "long", "norm": "long", "shape": "xxxx", "prefix": "l", "suffix": "ong", "length": 4, "cluster": "935", "prob": -7.544892311096191, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "doing", "id": 208, "lower": "doing", "norm": "doing", "shape": "xxxx", "prefix": "d", "suffix": "ing", "length": 5, "cluster": "15338", "prob": -7.553442478179932, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "getting", "id": 209, "lower": "getting", "norm": "getting", "shape": "xxxx", "prefix": "g", "suffix": "ing", "length": 7, "cluster": "31722", "prob": -7.564762115478516, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "post", "id": 210, "lower": "post", "norm": "post", "shape": "xxxx", "prefix": "p", "suffix": "ost", "length": 4, "cluster": "3733", "prob": -7.565684795379639, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "year", "id": 211, "lower": "year", "norm": "year", "shape": "xxxx", "prefix": "y", "suffix": "ear", "length": 4, "cluster": "29", "prob": -7.567681312561035, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Do", "id": 212, "lower": "do", "norm": "Do", "shape": "Xx", "prefix": "D", "suffix": "Do", "length": 2, "cluster": "702", "prob": -7.570033073425293, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "life", "id": 213, "lower": "life", "norm": "life", "shape": "xxxx", "prefix": "l", "suffix": "ife", "length": 4, "cluster": "1893", "prob": -7.574200630187988, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "through", "id": 214, "lower": "through", "norm": "through", "shape": "xxxx", "prefix": "t", "suffix": "ugh", "length": 7, "cluster": "65532", "prob": -7.575429439544678, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "guy", "id": 215, "lower": "guy", "norm": "guy", "shape": "xxx", "prefix": "g", "suffix": "guy", "length": 3, "cluster": "549", "prob": -7.582011699676514, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "enough", "id": 216, "lower": "enough", "norm": "enough", "shape": "xxxx", "prefix": "e", "suffix": "ugh", "length": 6, "cluster": "1834", "prob": -7.586349010467529, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "ever", "id": 217, "lower": "ever", "norm": "ever", "shape": "xxxx", "prefix": "e", "suffix": "ver", "length": 4, "cluster": "14058", "prob": -7.591183662414551, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "In", "id": 218, "lower": "in", "norm": "In", "shape": "Xx", "prefix": "I", "suffix": "In", "length": 2, "cluster": "62", "prob": -7.603263854980469, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "give", "id": 219, "lower": "give", "norm": "give", "shape": "xxxx", "prefix": "g", "suffix": "ive", "length": 4, "cluster": "522", "prob": -7.611863136291504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "mean", "id": 220, "lower": "mean", "norm": "mean", "shape": "xxxx", "prefix": "m", "suffix": "ean", "length": 4, "cluster": "3082", "prob": -7.611870765686035, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "thought", "id": 221, "lower": "thought", "norm": "thought", "shape": "xxxx", "prefix": "t", "suffix": "ght", "length": 7, "cluster": "650", "prob": -7.614910125732422, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "since", "id": 222, "lower": "since", "norm": "since", "shape": "xxxx", "prefix": "s", "suffix": "nce", "length": 5, "cluster": "468", "prob": -7.615171909332275, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "|", "id": 223, "lower": "|", "norm": "|", "shape": "|", "prefix": "|", "suffix": "|", "length": 1, "cluster": "0", "prob": -7.6297454833984375, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "different", "id": 224, "lower": "different", "norm": "different", "shape": "xxxx", "prefix": "d", "suffix": "ent", "length": 9, "cluster": "1319", "prob": -7.630640506744385, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "3", "id": 225, "lower": "3", "norm": "3", "shape": "d", "prefix": "3", "suffix": "3", "length": 1, "cluster": "818", "prob": -7.636006832122803, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "last", "id": 226, "lower": "last", "norm": "last", "shape": "xxxx", "prefix": "l", "suffix": "ast", "length": 4, "cluster": "127", "prob": -7.636077404022217, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "own", "id": 227, "lower": "own", "norm": "own", "shape": "xxx", "prefix": "o", "suffix": "own", "length": 3, "cluster": "217", "prob": -7.636797904968262, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "1", "id": 228, "lower": "1", "norm": "1", "shape": "d", "prefix": "1", "suffix": "1", "length": 1, "cluster": "306", "prob": -7.639832973480225, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "us", "id": 229, "lower": "us", "norm": "us", "shape": "xx", "prefix": "u", "suffix": "us", "length": 2, "cluster": "1898", "prob": -7.643693923950195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "%", "id": 230, "lower": "%", "norm": "%", "shape": "%", "prefix": "%", "suffix": "%", "length": 1, "cluster": "34", "prob": -7.645323753356934, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Not", "id": 231, "lower": "not", "norm": "Not", "shape": "Xxx", "prefix": "N", "suffix": "Not", "length": 3, "cluster": "1982", "prob": -7.65825080871582, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "put", "id": 232, "lower": "put", "norm": "put", "shape": "xxx", "prefix": "p", "suffix": "put", "length": 3, "cluster": "6314", "prob": -7.666473865509033, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "man", "id": 233, "lower": "man", "norm": "man", "shape": "xxx", "prefix": "m", "suffix": "man", "length": 3, "cluster": "549", "prob": -7.668745517730713, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "may", "id": 234, "lower": "may", "norm": "may", "shape": "xxx", "prefix": "m", "suffix": "may", "length": 3, "cluster": "186", "prob": -7.678494930267334, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "makes", "id": 235, "lower": "makes", "norm": "makes", "shape": "xxxx", "prefix": "m", "suffix": "kes", "length": 5, "cluster": "426", "prob": -7.684445858001709, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "money", "id": 236, "lower": "money", "norm": "money", "shape": "xxxx", "prefix": "m", "suffix": "ney", "length": 5, "cluster": "357", "prob": -7.693631172180176, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": ":)", "id": 237, "lower": ":)", "norm": ":)", "shape": ":)", "prefix": ":", "suffix": ":)", "length": 2, "cluster": "0", "prob": -7.694086074829102, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "without", "id": 238, "lower": "without", "norm": "without", "shape": "xxxx", "prefix": "w", "suffix": "out", "length": 7, "cluster": "57340", "prob": -7.694504261016846, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "bit", "id": 239, "lower": "bit", "norm": "bit", "shape": "xxx", "prefix": "b", "suffix": "bit", "length": 3, "cluster": "853", "prob": -7.721855640411377, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "person", "id": 240, "lower": "person", "norm": "person", "shape": "xxxx", "prefix": "p", "suffix": "son", "length": 6, "cluster": "549", "prob": -7.727076530456543, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Also", "id": 241, "lower": "also", "norm": "Also", "shape": "Xxxx", "prefix": "A", "suffix": "lso", "length": 4, "cluster": "254", "prob": -7.734253406524658, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "again", "id": 242, "lower": "again", "norm": "again", "shape": "xxxx", "prefix": "a", "suffix": "ain", "length": 5, "cluster": "28522", "prob": -7.7370924949646, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Just", "id": 243, "lower": "just", "norm": "Just", "shape": "Xxxx", "prefix": "J", "suffix": "ust", "length": 4, "cluster": "1982", "prob": -7.743429183959961, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "both", "id": 244, "lower": "both", "norm": "both", "shape": "xxxx", "prefix": "b", "suffix": "oth", "length": 4, "cluster": "1007", "prob": -7.750914573669434, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "help", "id": 245, "lower": "help", "norm": "help", "shape": "xxxx", "prefix": "h", "suffix": "elp", "length": 4, "cluster": "309", "prob": -7.758815288543701, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "trying", "id": 246, "lower": "trying", "norm": "trying", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 6, "cluster": "14378", "prob": -7.759474754333496, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "least", "id": 247, "lower": "least", "norm": "least", "shape": "xxxx", "prefix": "l", "suffix": "ast", "length": 5, "cluster": "3690", "prob": -7.7660088539123535, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "come", "id": 248, "lower": "come", "norm": "come", "shape": "xxxx", "prefix": "c", "suffix": "ome", "length": 4, "cluster": "7562", "prob": -7.775856971740723, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "keep", "id": 249, "lower": "keep", "norm": "keep", "shape": "xxxx", "prefix": "k", "suffix": "eep", "length": 4, "cluster": "3338", "prob": -7.778285980224609, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Thanks", "id": 250, "lower": "thanks", "norm": "Thanks", "shape": "Xxxxx", "prefix": "T", "suffix": "nks", "length": 6, "cluster": "510", "prob": -7.781467914581299, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "read", "id": 251, "lower": "read", "norm": "read", "shape": "xxxx", "prefix": "r", "suffix": "ead", "length": 4, "cluster": "6314", "prob": -7.787075042724609, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "nt", "id": 252, "lower": "nt", "norm": "nt", "shape": "xx", "prefix": "n", "suffix": "nt", "length": 2, "cluster": "3685", "prob": -7.788322925567627, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "part", "id": 253, "lower": "part", "norm": "part", "shape": "xxxx", "prefix": "p", "suffix": "art", "length": 4, "cluster": "725", "prob": -7.791079521179199, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "let", "id": 254, "lower": "let", "norm": "let", "shape": "xxx", "prefix": "l", "suffix": "let", "length": 3, "cluster": "522", "prob": -7.795135974884033, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "hard", "id": 255, "lower": "hard", "norm": "hard", "shape": "xxxx", "prefix": "h", "suffix": "ard", "length": 4, "cluster": "2538", "prob": -7.795384407043457, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "another", "id": 256, "lower": "another", "norm": "another", "shape": "xxxx", "prefix": "a", "suffix": "her", "length": 7, "cluster": "28650", "prob": -7.801506519317627, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "end", "id": 257, "lower": "end", "norm": "end", "shape": "xxx", "prefix": "e", "suffix": "end", "length": 3, "cluster": "21", "prob": -7.816553115844727, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "having", "id": 258, "lower": "having", "norm": "having", "shape": "xxxx", "prefix": "h", "suffix": "ing", "length": 6, "cluster": "130026", "prob": -7.818792819976807, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "As", "id": 259, "lower": "as", "norm": "As", "shape": "Xx", "prefix": "A", "suffix": "As", "length": 2, "cluster": "958", "prob": -7.836142539978027, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "games", "id": 260, "lower": "games", "norm": "games", "shape": "xxxx", "prefix": "g", "suffix": "mes", "length": 5, "cluster": "1485", "prob": -7.836157321929932, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "already", "id": 261, "lower": "already", "norm": "already", "shape": "xxxx", "prefix": "a", "suffix": "ady", "length": 7, "cluster": "634", "prob": -7.838688850402832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "..", "id": 0, "lower": "..", "norm": "..", "shape": "..", "prefix": ".", "suffix": "..", "length": 2, "cluster": "4906", "prob": -7.840396404266357, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "problem", "id": 262, "lower": "problem", "norm": "problem", "shape": "xxxx", "prefix": "p", "suffix": "lem", "length": 7, "cluster": "16069", "prob": -7.841479301452637, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "kind", "id": 263, "lower": "kind", "norm": "kind", "shape": "xxxx", "prefix": "k", "suffix": "ind", "length": 4, "cluster": "213", "prob": -7.844367980957031, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "old", "id": 264, "lower": "old", "norm": "old", "shape": "xxx", "prefix": "o", "suffix": "old", "length": 3, "cluster": "2346", "prob": -7.845602989196777, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "everyone", "id": 265, "lower": "everyone", "norm": "everyone", "shape": "xxxx", "prefix": "e", "suffix": "one", "length": 8, "cluster": "30698", "prob": -7.850788116455078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "saying", "id": 266, "lower": "saying", "norm": "saying", "shape": "xxxx", "prefix": "s", "suffix": "ing", "length": 6, "cluster": "3732", "prob": -7.854340076446533, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "idea", "id": 267, "lower": "idea", "norm": "idea", "shape": "xxxx", "prefix": "i", "suffix": "dea", "length": 4, "cluster": "709", "prob": -7.855560779571533, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "else", "id": 268, "lower": "else", "norm": "else", "shape": "xxxx", "prefix": "e", "suffix": "lse", "length": 4, "cluster": "2013", "prob": -7.86043643951416, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "reason", "id": 269, "lower": "reason", "norm": "reason", "shape": "xxxx", "prefix": "r", "suffix": "son", "length": 6, "cluster": "113", "prob": -7.867291450500488, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Well", "id": 270, "lower": "well", "norm": "Well", "shape": "Xxxx", "prefix": "W", "suffix": "ell", "length": 4, "cluster": "1726", "prob": -7.871857643127441, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "less", "id": 271, "lower": "less", "norm": "less", "shape": "xxxx", "prefix": "l", "suffix": "ess", "length": 4, "cluster": "5610", "prob": -7.872425079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "world", "id": 272, "lower": "world", "norm": "world", "shape": "xxxx", "prefix": "w", "suffix": "rld", "length": 5, "cluster": "329", "prob": -7.8744120597839355, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "wrong", "id": 273, "lower": "wrong", "norm": "wrong", "shape": "xxxx", "prefix": "w", "suffix": "ong", "length": 5, "cluster": "4586", "prob": -7.876842021942139, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "How", "id": 274, "lower": "how", "norm": "How", "shape": "Xxx", "prefix": "H", "suffix": "How", "length": 3, "cluster": "702", "prob": -7.879385948181152, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "far", "id": 275, "lower": "far", "norm": "far", "shape": "xxx", "prefix": "f", "suffix": "far", "length": 3, "cluster": "6890", "prob": -7.8802924156188965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "big", "id": 276, "lower": "big", "norm": "big", "shape": "xxx", "prefix": "b", "suffix": "big", "length": 3, "cluster": "135", "prob": -7.880735874176025, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "done", "id": 277, "lower": "done", "norm": "done", "shape": "xxxx", "prefix": "d", "suffix": "one", "length": 4, "cluster": "26282", "prob": -7.886453151702881, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "believe", "id": 278, "lower": "believe", "norm": "believe", "shape": "xxxx", "prefix": "b", "suffix": "eve", "length": 7, "cluster": "138", "prob": -7.886724948883057, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Yeah", "id": 279, "lower": "yeah", "norm": "Yeah", "shape": "Xxxx", "prefix": "Y", "suffix": "eah", "length": 4, "cluster": "1726", "prob": -7.890377044677734, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "such", "id": 280, "lower": "such", "norm": "such", "shape": "xxxx", "prefix": "s", "suffix": "uch", "length": 4, "cluster": "111", "prob": -7.894707679748535, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "stuff", "id": 281, "lower": "stuff", "norm": "stuff", "shape": "xxxx", "prefix": "s", "suffix": "uff", "length": 5, "cluster": "6853", "prob": -7.898244380950928, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "away", "id": 282, "lower": "away", "norm": "away", "shape": "xxxx", "prefix": "a", "suffix": "way", "length": 4, "cluster": "3434", "prob": -7.9017462730407715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "nothing", "id": 283, "lower": "nothing", "norm": "nothing", "shape": "xxxx", "prefix": "n", "suffix": "ing", "length": 7, "cluster": "14314", "prob": -7.909971714019775, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "tell", "id": 284, "lower": "tell", "norm": "tell", "shape": "xxxx", "prefix": "t", "suffix": "ell", "length": 4, "cluster": "1546", "prob": -7.910365581512451, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "looking", "id": 285, "lower": "looking", "norm": "looking", "shape": "xxxx", "prefix": "l", "suffix": "ing", "length": 7, "cluster": "1066", "prob": -7.911639213562012, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "start", "id": 286, "lower": "start", "norm": "start", "shape": "xxxx", "prefix": "s", "suffix": "art", "length": 5, "cluster": "3978", "prob": -7.923925876617432, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "using", "id": 287, "lower": "using", "norm": "using", "shape": "xxxx", "prefix": "u", "suffix": "ing", "length": 5, "cluster": "7146", "prob": -7.938363075256348, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "able", "id": 288, "lower": "able", "norm": "able", "shape": "xxxx", "prefix": "a", "suffix": "ble", "length": 4, "cluster": "6186", "prob": -7.939544677734375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "place", "id": 289, "lower": "place", "norm": "place", "shape": "xxxx", "prefix": "p", "suffix": "ace", "length": 5, "cluster": "6245", "prob": -7.954748153686523, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "high", "id": 290, "lower": "high", "norm": "high", "shape": "xxxx", "prefix": "h", "suffix": "igh", "length": 4, "cluster": "167", "prob": -7.963760852813721, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "until", "id": 291, "lower": "until", "norm": "until", "shape": "xxxx", "prefix": "u", "suffix": "til", "length": 5, "cluster": "2516", "prob": -7.964784622192383, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "either", "id": 292, "lower": "either", "norm": "either", "shape": "xxxx", "prefix": "e", "suffix": "her", "length": 6, "cluster": "30698", "prob": -7.965897560119629, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "seen", "id": 293, "lower": "seen", "norm": "seen", "shape": "xxxx", "prefix": "s", "suffix": "een", "length": 4, "cluster": "26282", "prob": -7.97322416305542, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "times", "id": 294, "lower": "times", "norm": "times", "shape": "xxxx", "prefix": "t", "suffix": "mes", "length": 5, "cluster": "61", "prob": -7.9734907150268555, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "real", "id": 295, "lower": "real", "norm": "real", "shape": "xxxx", "prefix": "r", "suffix": "eal", "length": 4, "cluster": "503", "prob": -7.981620788574219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "When", "id": 296, "lower": "when", "norm": "When", "shape": "Xxxx", "prefix": "W", "suffix": "hen", "length": 4, "cluster": "190", "prob": -7.982150554656982, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "making", "id": 297, "lower": "making", "norm": "making", "shape": "xxxx", "prefix": "m", "suffix": "ing", "length": 6, "cluster": "7146", "prob": -7.985988616943359, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "seems", "id": 298, "lower": "seems", "norm": "seems", "shape": "xxxx", "prefix": "s", "suffix": "ems", "length": 5, "cluster": "16298", "prob": -7.989145278930664, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "fuck", "id": 299, "lower": "fuck", "norm": "fuck", "shape": "xxxx", "prefix": "f", "suffix": "uck", "length": 4, "cluster": "0", "prob": -7.992913246154785, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "fucking", "id": 300, "lower": "fucking", "norm": "fucking", "shape": "xxxx", "prefix": "f", "suffix": "ing", "length": 7, "cluster": "0", "prob": -7.993165969848633, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "\n\n\n", "id": 0, "lower": "\n\n\n", "norm": "\n\n\n", "shape": "\n\n\n", "prefix": "\n", "suffix": "\n\n\n", "length": 3, "cluster": "0", "prob": -7.996075630187988, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "next", "id": 301, "lower": "next", "norm": "next", "shape": "xxxx", "prefix": "n", "suffix": "ext", "length": 4, "cluster": "255", "prob": -7.996739864349365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "anyone", "id": 302, "lower": "anyone", "norm": "anyone", "shape": "xxxx", "prefix": "a", "suffix": "one", "length": 6, "cluster": "30698", "prob": -7.997350215911865, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "#", "id": 303, "lower": "#", "norm": "#", "shape": "#", "prefix": "#", "suffix": "#", "length": 1, "cluster": "18", "prob": -8.001263618469238, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "looks", "id": 304, "lower": "looks", "norm": "looks", "shape": "xxxx", "prefix": "l", "suffix": "oks", "length": 5, "cluster": "2442", "prob": -8.001678466796875, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "everything", "id": 305, "lower": "everything", "norm": "everything", "shape": "xxxx", "prefix": "e", "suffix": "ing", "length": 10, "cluster": "14314", "prob": -8.00584602355957, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Oh", "id": 306, "lower": "oh", "norm": "Oh", "shape": "Xx", "prefix": "O", "suffix": "Oh", "length": 2, "cluster": "1726", "prob": -8.007224082946777, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "nice", "id": 307, "lower": "nice", "norm": "nice", "shape": "xxxx", "prefix": "n", "suffix": "ice", "length": 4, "cluster": "551", "prob": -8.009806632995605, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "once", "id": 308, "lower": "once", "norm": "once", "shape": "xxxx", "prefix": "o", "suffix": "nce", "length": 4, "cluster": "22250", "prob": -8.010163307189941, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "show", "id": 309, "lower": "show", "norm": "show", "shape": "xxxx", "prefix": "s", "suffix": "how", "length": 4, "cluster": "7690", "prob": -8.011373519897461, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "maybe", "id": 310, "lower": "maybe", "norm": "maybe", "shape": "xxxx", "prefix": "m", "suffix": "ybe", "length": 5, "cluster": "60650", "prob": -8.020626068115234, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "fact", "id": 311, "lower": "fact", "norm": "fact", "shape": "xxxx", "prefix": "f", "suffix": "act", "length": 4, "cluster": "369", "prob": -8.032754898071289, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "wo", "id": 312, "lower": "wo", "norm": "wo", "shape": "xx", "prefix": "w", "suffix": "wo", "length": 2, "cluster": "26", "prob": -8.0400972366333, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "5", "id": 313, "lower": "5", "norm": "5", "shape": "d", "prefix": "5", "suffix": "5", "length": 1, "cluster": "818", "prob": -8.040534019470215, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "free", "id": 314, "lower": "free", "norm": "free", "shape": "xxxx", "prefix": "f", "suffix": "ree", "length": 4, "cluster": "6634", "prob": -8.0440092086792, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "understand", "id": 315, "lower": "understand", "norm": "understand", "shape": "xxxx", "prefix": "u", "suffix": "and", "length": 10, "cluster": "3722", "prob": -8.052404403686523, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "team", "id": 316, "lower": "team", "norm": "team", "shape": "xxxx", "prefix": "t", "suffix": "eam", "length": 4, "cluster": "1061", "prob": -8.053070068359375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "....", "id": 317, "lower": "....", "norm": "....", "shape": "....", "prefix": ".", "suffix": "...", "length": 4, "cluster": "1202", "prob": -8.05477523803711, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "against", "id": 318, "lower": "against", "norm": "against", "shape": "xxxx", "prefix": "a", "suffix": "nst", "length": 7, "cluster": "24572", "prob": -8.064282417297363, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "live", "id": 319, "lower": "live", "norm": "live", "shape": "xxxx", "prefix": "l", "suffix": "ive", "length": 4, "cluster": "1418", "prob": -8.065953254699707, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": " \n\n", "id": 0, "lower": " \n\n", "norm": " \n\n", "shape": " \n\n", "prefix": " ", "suffix": " \n\n", "length": 3, "cluster": "0", "prob": -8.068946838378906, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Why", "id": 320, "lower": "why", "norm": "Why", "shape": "Xxx", "prefix": "W", "suffix": "Why", "length": 3, "cluster": "702", "prob": -8.06901741027832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "whole", "id": 321, "lower": "whole", "norm": "whole", "shape": "xxxx", "prefix": "w", "suffix": "ole", "length": 5, "cluster": "71", "prob": -8.070209503173828, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "For", "id": 322, "lower": "for", "norm": "For", "shape": "Xxx", "prefix": "F", "suffix": "For", "length": 3, "cluster": "1342", "prob": -8.072200775146484, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "guys", "id": 323, "lower": "guys", "norm": "guys", "shape": "xxxx", "prefix": "g", "suffix": "uys", "length": 4, "cluster": "365", "prob": -8.075167655944824, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "job", "id": 324, "lower": "job", "norm": "job", "shape": "xxx", "prefix": "j", "suffix": "job", "length": 3, "cluster": "37", "prob": -8.082273483276367, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "etc", "id": 325, "lower": "etc", "norm": "etc", "shape": "xxx", "prefix": "e", "suffix": "etc", "length": 3, "cluster": "26", "prob": -8.087606430053711, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "4", "id": 326, "lower": "4", "norm": "4", "shape": "d", "prefix": "4", "suffix": "4", "length": 1, "cluster": "818", "prob": -8.088510513305664, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "went", "id": 327, "lower": "went", "norm": "went", "shape": "xxxx", "prefix": "w", "suffix": "ent", "length": 4, "cluster": "7338", "prob": -8.091073989868164, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "school", "id": 328, "lower": "school", "norm": "school", "shape": "xxxx", "prefix": "s", "suffix": "ool", "length": 6, "cluster": "1829", "prob": -8.096077919006348, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "guess", "id": 329, "lower": "guess", "norm": "guess", "shape": "xxxx", "prefix": "g", "suffix": "ess", "length": 5, "cluster": "650", "prob": -8.097951889038086, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "friends", "id": 330, "lower": "friends", "norm": "friends", "shape": "xxxx", "prefix": "f", "suffix": "nds", "length": 7, "cluster": "3565", "prob": -8.10158634185791, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "between", "id": 331, "lower": "between", "norm": "between", "shape": "xxxx", "prefix": "b", "suffix": "een", "length": 7, "cluster": "12284", "prob": -8.106386184692383, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "case", "id": 332, "lower": "case", "norm": "case", "shape": "xxxx", "prefix": "c", "suffix": "ase", "length": 4, "cluster": "3269", "prob": -8.106882095336914, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "She", "id": 333, "lower": "she", "norm": "She", "shape": "Xxx", "prefix": "S", "suffix": "She", "length": 3, "cluster": "126", "prob": -8.119241714477539, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "each", "id": 334, "lower": "each", "norm": "each", "shape": "xxxx", "prefix": "e", "suffix": "ach", "length": 4, "cluster": "32746", "prob": -8.123948097229004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "fun", "id": 335, "lower": "fun", "norm": "fun", "shape": "xxx", "prefix": "f", "suffix": "fun", "length": 3, "cluster": "16229", "prob": -8.124406814575195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "agree", "id": 336, "lower": "agree", "norm": "agree", "shape": "xxxx", "prefix": "a", "suffix": "ree", "length": 5, "cluster": "394", "prob": -8.12778091430664, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Is", "id": 337, "lower": "is", "norm": "Is", "shape": "Xx", "prefix": "I", "suffix": "Is", "length": 2, "cluster": "1214", "prob": -8.129456520080566, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "buy", "id": 338, "lower": "buy", "norm": "buy", "shape": "xxx", "prefix": "b", "suffix": "buy", "length": 3, "cluster": "2826", "prob": -8.142950057983398, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Yes", "id": 339, "lower": "yes", "norm": "Yes", "shape": "Xxx", "prefix": "Y", "suffix": "Yes", "length": 3, "cluster": "1726", "prob": -8.147512435913086, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "run", "id": 340, "lower": "run", "norm": "run", "shape": "xxx", "prefix": "r", "suffix": "run", "length": 3, "cluster": "437", "prob": -8.156776428222656, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "change", "id": 341, "lower": "change", "norm": "change", "shape": "xxxx", "prefix": "c", "suffix": "nge", "length": 6, "cluster": "2997", "prob": -8.157740592956543, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "found", "id": 342, "lower": "found", "norm": "found", "shape": "xxxx", "prefix": "f", "suffix": "und", "length": 5, "cluster": "13738", "prob": -8.182107925415039, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "question", "id": 343, "lower": "question", "norm": "question", "shape": "xxxx", "prefix": "q", "suffix": "ion", "length": 8, "cluster": "709", "prob": -8.185464859008789, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "top", "id": 344, "lower": "top", "norm": "top", "shape": "xxx", "prefix": "t", "suffix": "top", "length": 3, "cluster": "1479", "prob": -8.191086769104004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "playing", "id": 345, "lower": "playing", "norm": "playing", "shape": "xxxx", "prefix": "p", "suffix": "ing", "length": 7, "cluster": "11242", "prob": -8.191595077514648, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "name", "id": 346, "lower": "name", "norm": "name", "shape": "xxxx", "prefix": "n", "suffix": "ame", "length": 4, "cluster": "4021", "prob": -8.19616985321045, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "mind", "id": 347, "lower": "mind", "norm": "mind", "shape": "xxxx", "prefix": "m", "suffix": "ind", "length": 4, "cluster": "1893", "prob": -8.197138786315918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "myself", "id": 348, "lower": "myself", "norm": "myself", "shape": "xxxx", "prefix": "m", "suffix": "elf", "length": 6, "cluster": "8042", "prob": -8.200143814086914, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "gets", "id": 349, "lower": "gets", "norm": "gets", "shape": "xxxx", "prefix": "g", "suffix": "ets", "length": 4, "cluster": "10666", "prob": -8.202808380126953, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "ago", "id": 350, "lower": "ago", "norm": "ago", "shape": "xxx", "prefix": "a", "suffix": "ago", "length": 3, "cluster": "6442", "prob": -8.206598281860352, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "friend", "id": 351, "lower": "friend", "norm": "friend", "shape": "xxxx", "prefix": "f", "suffix": "end", "length": 6, "cluster": "1061", "prob": -8.210515975952148, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "talking", "id": 352, "lower": "talking", "norm": "talking", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 7, "cluster": "4586", "prob": -8.22729778289795, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "days", "id": 353, "lower": "days", "norm": "days", "shape": "xxxx", "prefix": "d", "suffix": "ays", "length": 4, "cluster": "317", "prob": -8.227437973022461, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "yet", "id": 354, "lower": "yet", "norm": "yet", "shape": "xxx", "prefix": "y", "suffix": "yet", "length": 3, "cluster": "32490", "prob": -8.229137420654297, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "means", "id": 355, "lower": "means", "norm": "means", "shape": "xxxx", "prefix": "m", "suffix": "ans", "length": 5, "cluster": "31146", "prob": -8.234617233276367, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "hope", "id": 356, "lower": "hope", "norm": "hope", "shape": "xxxx", "prefix": "h", "suffix": "ope", "length": 4, "cluster": "650", "prob": -8.236272811889648, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "almost", "id": 357, "lower": "almost", "norm": "almost", "shape": "xxxx", "prefix": "a", "suffix": "ost", "length": 6, "cluster": "7402", "prob": -8.236738204956055, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "yourself", "id": 358, "lower": "yourself", "norm": "yourself", "shape": "xxxx", "prefix": "y", "suffix": "elf", "length": 8, "cluster": "8042", "prob": -8.2402982711792, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "awesome", "id": 359, "lower": "awesome", "norm": "awesome", "shape": "xxxx", "prefix": "a", "suffix": "ome", "length": 7, "cluster": "871", "prob": -8.247021675109863, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "care", "id": 360, "lower": "care", "norm": "care", "shape": "xxxx", "prefix": "c", "suffix": "are", "length": 4, "cluster": "1229", "prob": -8.248679161071777, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "quite", "id": 361, "lower": "quite", "norm": "quite", "shape": "xxxx", "prefix": "q", "suffix": "ite", "length": 5, "cluster": "15338", "prob": -8.254060745239258, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "10", "id": 362, "lower": "10", "norm": "10", "shape": "dd", "prefix": "1", "suffix": "10", "length": 2, "cluster": "1970", "prob": -8.258377075195312, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "true", "id": 363, "lower": "true", "norm": "true", "shape": "xxxx", "prefix": "t", "suffix": "rue", "length": 4, "cluster": "4586", "prob": -8.259368896484375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "remember", "id": 364, "lower": "remember", "norm": "remember", "shape": "xxxx", "prefix": "r", "suffix": "ber", "length": 8, "cluster": "3722", "prob": -8.259916305541992, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "definitely", "id": 365, "lower": "definitely", "norm": "definitely", "shape": "xxxx", "prefix": "d", "suffix": "ely", "length": 10, "cluster": "7802", "prob": -8.264209747314453, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "call", "id": 366, "lower": "call", "norm": "call", "shape": "xxxx", "prefix": "c", "suffix": "all", "length": 4, "cluster": "3765", "prob": -8.267317771911621, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "pay", "id": 367, "lower": "pay", "norm": "pay", "shape": "xxx", "prefix": "p", "suffix": "pay", "length": 3, "cluster": "7946", "prob": -8.26932144165039, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "stop", "id": 368, "lower": "stop", "norm": "stop", "shape": "xxxx", "prefix": "s", "suffix": "top", "length": 4, "cluster": "3338", "prob": -8.272970199584961, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "set", "id": 369, "lower": "set", "norm": "set", "shape": "xxx", "prefix": "s", "suffix": "set", "length": 3, "cluster": "2218", "prob": -8.285635948181152, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "started", "id": 370, "lower": "started", "norm": "started", "shape": "xxxx", "prefix": "s", "suffix": "ted", "length": 7, "cluster": "3242", "prob": -8.286487579345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "instead", "id": 371, "lower": "instead", "norm": "instead", "shape": "xxxx", "prefix": "i", "suffix": "ead", "length": 7, "cluster": "2005", "prob": -8.292781829833984, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "story", "id": 372, "lower": "story", "norm": "story", "shape": "xxxx", "prefix": "s", "suffix": "ory", "length": 5, "cluster": "6853", "prob": -8.293317794799805, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "level", "id": 373, "lower": "level", "norm": "level", "shape": "xxxx", "prefix": "l", "suffix": "vel", "length": 5, "cluster": "6117", "prob": -8.29642391204834, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "left", "id": 374, "lower": "left", "norm": "left", "shape": "xxxx", "prefix": "l", "suffix": "eft", "length": 4, "cluster": "54954", "prob": -8.296669006347656, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "week", "id": 375, "lower": "week", "norm": "week", "shape": "xxxx", "prefix": "w", "suffix": "eek", "length": 4, "cluster": "157", "prob": -8.300933837890625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "system", "id": 376, "lower": "system", "norm": "system", "shape": "xxxx", "prefix": "s", "suffix": "tem", "length": 6, "cluster": "4901", "prob": -8.303738594055176, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "full", "id": 377, "lower": "full", "norm": "full", "shape": "xxxx", "prefix": "f", "suffix": "ull", "length": 4, "cluster": "4071", "prob": -8.303950309753418, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "rather", "id": 378, "lower": "rather", "norm": "rather", "shape": "xxxx", "prefix": "r", "suffix": "her", "length": 6, "cluster": "6698", "prob": -8.312031745910645, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "video", "id": 379, "lower": "video", "norm": "video", "shape": "xxxx", "prefix": "v", "suffix": "deo", "length": 5, "cluster": "1975", "prob": -8.316000938415527, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "home", "id": 380, "lower": "home", "norm": "home", "shape": "xxxx", "prefix": "h", "suffix": "ome", "length": 4, "cluster": "1013", "prob": -8.316133499145508, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "women", "id": 381, "lower": "women", "norm": "women", "shape": "xxxx", "prefix": "w", "suffix": "men", "length": 5, "cluster": "877", "prob": -8.317564964294434, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "usually", "id": 382, "lower": "usually", "norm": "usually", "shape": "xxxx", "prefix": "u", "suffix": "lly", "length": 7, "cluster": "3706", "prob": -8.324220657348633, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "side", "id": 383, "lower": "side", "norm": "side", "shape": "xxxx", "prefix": "s", "suffix": "ide", "length": 4, "cluster": "8037", "prob": -8.327798843383789, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "wanted", "id": 384, "lower": "wanted", "norm": "wanted", "shape": "xxxx", "prefix": "w", "suffix": "ted", "length": 6, "cluster": "30634", "prob": -8.329934120178223, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "sense", "id": 385, "lower": "sense", "norm": "sense", "shape": "xxxx", "prefix": "s", "suffix": "nse", "length": 5, "cluster": "613", "prob": -8.338400840759277, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Your", "id": 386, "lower": "your", "norm": "Your", "shape": "Xxxx", "prefix": "Y", "suffix": "our", "length": 4, "cluster": "94", "prob": -8.347208023071289, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "second", "id": 387, "lower": "second", "norm": "second", "shape": "xxxx", "prefix": "s", "suffix": "ond", "length": 6, "cluster": "31", "prob": -8.351142883300781, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "comment", "id": 388, "lower": "comment", "norm": "comment", "shape": "xxxx", "prefix": "c", "suffix": "ent", "length": 7, "cluster": "757", "prob": -8.35578727722168, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "course", "id": 389, "lower": "course", "norm": "course", "shape": "xxxx", "prefix": "c", "suffix": "rse", "length": 6, "cluster": "1009", "prob": -8.35777759552002, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "ask", "id": 390, "lower": "ask", "norm": "ask", "shape": "xxx", "prefix": "a", "suffix": "ask", "length": 3, "cluster": "1546", "prob": -8.35922622680664, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Or", "id": 391, "lower": "or", "norm": "Or", "shape": "Xx", "prefix": "O", "suffix": "Or", "length": 2, "cluster": "1726", "prob": -8.361105918884277, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "seem", "id": 392, "lower": "seem", "norm": "seem", "shape": "xxxx", "prefix": "s", "suffix": "eem", "length": 4, "cluster": "906", "prob": -8.363061904907227, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Maybe", "id": 393, "lower": "maybe", "norm": "Maybe", "shape": "Xxxxx", "prefix": "M", "suffix": "ybe", "length": 5, "cluster": "190", "prob": -8.364654541015625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "must", "id": 394, "lower": "must", "norm": "must", "shape": "xxxx", "prefix": "m", "suffix": "ust", "length": 4, "cluster": "698", "prob": -8.365957260131836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Then", "id": 395, "lower": "then", "norm": "Then", "shape": "Xxxx", "prefix": "T", "suffix": "hen", "length": 4, "cluster": "1726", "prob": -8.369159698486328, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "small", "id": 396, "lower": "small", "norm": "small", "shape": "xxxx", "prefix": "s", "suffix": "all", "length": 5, "cluster": "391", "prob": -8.371565818786621, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "car", "id": 397, "lower": "car", "norm": "car", "shape": "xxx", "prefix": "c", "suffix": "car", "length": 3, "cluster": "1145", "prob": -8.374984741210938, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "hate", "id": 398, "lower": "hate", "norm": "hate", "shape": "xxxx", "prefix": "h", "suffix": "ate", "length": 4, "cluster": "906", "prob": -8.380099296569824, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "came", "id": 399, "lower": "came", "norm": "came", "shape": "xxxx", "prefix": "c", "suffix": "ame", "length": 4, "cluster": "15530", "prob": -8.382718086242676, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "watch", "id": 400, "lower": "watch", "norm": "watch", "shape": "xxxx", "prefix": "w", "suffix": "tch", "length": 5, "cluster": "3765", "prob": -8.386272430419922, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "experience", "id": 401, "lower": "experience", "norm": "experience", "shape": "xxxx", "prefix": "e", "suffix": "nce", "length": 10, "cluster": "2917", "prob": -8.387101173400879, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "cool", "id": 402, "lower": "cool", "norm": "cool", "shape": "xxxx", "prefix": "c", "suffix": "ool", "length": 4, "cluster": "565", "prob": -8.393746376037598, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "matter", "id": 403, "lower": "matter", "norm": "matter", "shape": "xxxx", "prefix": "m", "suffix": "ter", "length": 6, "cluster": "4805", "prob": -8.395515441894531, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "others", "id": 404, "lower": "others", "norm": "others", "shape": "xxxx", "prefix": "o", "suffix": "ers", "length": 6, "cluster": "1901", "prob": -8.396527290344238, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "completely", "id": 405, "lower": "completely", "norm": "completely", "shape": "xxxx", "prefix": "c", "suffix": "ely", "length": 10, "cluster": "12010", "prob": -8.40324592590332, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "All", "id": 406, "lower": "all", "norm": "All", "shape": "Xxx", "prefix": "A", "suffix": "All", "length": 3, "cluster": "1214", "prob": -8.403707504272461, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "called", "id": 407, "lower": "called", "norm": "called", "shape": "xxxx", "prefix": "c", "suffix": "led", "length": 6, "cluster": "11946", "prob": -8.404229164123535, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "under", "id": 408, "lower": "under", "norm": "under", "shape": "xxxx", "prefix": "u", "suffix": "der", "length": 5, "cluster": "32764", "prob": -8.406200408935547, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "yes", "id": 409, "lower": "yes", "norm": "yes", "shape": "xxx", "prefix": "y", "suffix": "yes", "length": 3, "cluster": "15146", "prob": -8.41097354888916, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Now", "id": 410, "lower": "now", "norm": "Now", "shape": "Xxx", "prefix": "N", "suffix": "Now", "length": 3, "cluster": "1726", "prob": -8.417712211608887, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Please", "id": 411, "lower": "please", "norm": "Please", "shape": "Xxxxx", "prefix": "P", "suffix": "ase", "length": 6, "cluster": "3582", "prob": -8.41897964477539, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "worth", "id": 412, "lower": "worth", "norm": "worth", "shape": "xxxx", "prefix": "w", "suffix": "rth", "length": 5, "cluster": "981", "prob": -8.423324584960938, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "says", "id": 413, "lower": "says", "norm": "says", "shape": "xxxx", "prefix": "s", "suffix": "ays", "length": 4, "cluster": "244", "prob": -8.426565170288086, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "comes", "id": 414, "lower": "comes", "norm": "comes", "shape": "xxxx", "prefix": "c", "suffix": "mes", "length": 5, "cluster": "15530", "prob": -8.428640365600586, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "fine", "id": 415, "lower": "fine", "norm": "fine", "shape": "xxxx", "prefix": "f", "suffix": "ine", "length": 4, "cluster": "8057", "prob": -8.428781509399414, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Thank", "id": 416, "lower": "thank", "norm": "Thank", "shape": "Xxxxx", "prefix": "T", "suffix": "ank", "length": 5, "cluster": "190", "prob": -8.434432983398438, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": " \n", "id": 0, "lower": " \n", "norm": " \n", "shape": " \n", "prefix": " ", "suffix": " \n", "length": 2, "cluster": "0", "prob": -8.435208320617676, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "works", "id": 417, "lower": "works", "norm": "works", "shape": "xxxx", "prefix": "w", "suffix": "rks", "length": 5, "cluster": "77", "prob": -8.436944961547852, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "exactly", "id": 418, "lower": "exactly", "norm": "exactly", "shape": "xxxx", "prefix": "e", "suffix": "tly", "length": 7, "cluster": "15338", "prob": -8.43747615814209, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "heard", "id": 419, "lower": "heard", "norm": "heard", "shape": "xxxx", "prefix": "h", "suffix": "ard", "length": 5, "cluster": "26282", "prob": -8.4396333694458, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "possible", "id": 420, "lower": "possible", "norm": "possible", "shape": "xxxx", "prefix": "p", "suffix": "ble", "length": 8, "cluster": "2535", "prob": -8.44277572631836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "thinking", "id": 421, "lower": "thinking", "norm": "thinking", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 8, "cluster": "4586", "prob": -8.442947387695312, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "hours", "id": 422, "lower": "hours", "norm": "hours", "shape": "xxxx", "prefix": "h", "suffix": "urs", "length": 5, "cluster": "957", "prob": -8.445417404174805, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "working", "id": 423, "lower": "working", "norm": "working", "shape": "xxxx", "prefix": "w", "suffix": "ing", "length": 7, "cluster": "27626", "prob": -8.44786262512207, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "took", "id": 424, "lower": "took", "norm": "took", "shape": "xxxx", "prefix": "t", "suffix": "ook", "length": 4, "cluster": "27050", "prob": -8.452874183654785, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "thanks", "id": 425, "lower": "thanks", "norm": "thanks", "shape": "xxxx", "prefix": "t", "suffix": "nks", "length": 6, "cluster": "554", "prob": -8.457283973693848, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "head", "id": 426, "lower": "head", "norm": "head", "shape": "xxxx", "prefix": "h", "suffix": "ead", "length": 4, "cluster": "1813", "prob": -8.458500862121582, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "power", "id": 427, "lower": "power", "norm": "power", "shape": "xxxx", "prefix": "p", "suffix": "wer", "length": 5, "cluster": "11621", "prob": -8.460216522216797, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "happen", "id": 428, "lower": "happen", "norm": "happen", "shape": "xxxx", "prefix": "h", "suffix": "pen", "length": 6, "cluster": "3466", "prob": -8.465093612670898, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "goes", "id": 429, "lower": "goes", "norm": "goes", "shape": "xxxx", "prefix": "g", "suffix": "oes", "length": 4, "cluster": "7338", "prob": -8.465673446655273, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Good", "id": 430, "lower": "good", "norm": "Good", "shape": "Xxxx", "prefix": "G", "suffix": "ood", "length": 4, "cluster": "614", "prob": -8.468016624450684, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "saw", "id": 431, "lower": "saw", "norm": "saw", "shape": "xxx", "prefix": "s", "suffix": "saw", "length": 3, "cluster": "6570", "prob": -8.472514152526855, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "please", "id": 432, "lower": "please", "norm": "please", "shape": "xxxx", "prefix": "p", "suffix": "ase", "length": 6, "cluster": "309", "prob": -8.473013877868652, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "couple", "id": 433, "lower": "couple", "norm": "couple", "shape": "xxxx", "prefix": "c", "suffix": "ple", "length": 6, "cluster": "853", "prob": -8.47309398651123, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "hit", "id": 434, "lower": "hit", "norm": "hit", "shape": "xxx", "prefix": "h", "suffix": "hit", "length": 3, "cluster": "682", "prob": -8.473491668701172, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "likely", "id": 435, "lower": "likely", "norm": "likely", "shape": "xxxx", "prefix": "l", "suffix": "ely", "length": 6, "cluster": "42", "prob": -8.47359561920166, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "ones", "id": 436, "lower": "ones", "norm": "ones", "shape": "xxxx", "prefix": "o", "suffix": "nes", "length": 4, "cluster": "15821", "prob": -8.474469184875488, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "often", "id": 437, "lower": "often", "norm": "often", "shape": "xxxx", "prefix": "o", "suffix": "ten", "length": 5, "cluster": "3706", "prob": -8.476237297058105, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "talk", "id": 438, "lower": "talk", "norm": "talk", "shape": "xxxx", "prefix": "t", "suffix": "alk", "length": 4, "cluster": "394", "prob": -8.479889869689941, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "issue", "id": 439, "lower": "issue", "norm": "issue", "shape": "xxxx", "prefix": "i", "suffix": "sue", "length": 5, "cluster": "3525", "prob": -8.48391342163086, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "easy", "id": 440, "lower": "easy", "norm": "easy", "shape": "xxxx", "prefix": "e", "suffix": "asy", "length": 4, "cluster": "2538", "prob": -8.489182472229004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "One", "id": 441, "lower": "one", "norm": "One", "shape": "Xxx", "prefix": "O", "suffix": "One", "length": 3, "cluster": "350", "prob": -8.494391441345215, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "needs", "id": 442, "lower": "needs", "norm": "needs", "shape": "xxxx", "prefix": "n", "suffix": "eds", "length": 5, "cluster": "14250", "prob": -8.49528694152832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "add", "id": 443, "lower": "add", "norm": "add", "shape": "xxx", "prefix": "a", "suffix": "add", "length": 3, "cluster": "3594", "prob": -8.496837615966797, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "support", "id": 444, "lower": "support", "norm": "support", "shape": "xxxx", "prefix": "s", "suffix": "ort", "length": 7, "cluster": "7861", "prob": -8.503355026245117, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "face", "id": 445, "lower": "face", "norm": "face", "shape": "xxxx", "prefix": "f", "suffix": "ace", "length": 4, "cluster": "1685", "prob": -8.504852294921875, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "hand", "id": 446, "lower": "hand", "norm": "hand", "shape": "xxxx", "prefix": "h", "suffix": "and", "length": 4, "cluster": "8037", "prob": -8.504961967468262, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "half", "id": 447, "lower": "half", "norm": "half", "shape": "xxxx", "prefix": "h", "suffix": "alf", "length": 4, "cluster": "469", "prob": -8.508658409118652, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "check", "id": 448, "lower": "check", "norm": "check", "shape": "xxxx", "prefix": "c", "suffix": "eck", "length": 5, "cluster": "2485", "prob": -8.512067794799805, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "night", "id": 449, "lower": "night", "norm": "night", "shape": "xxxx", "prefix": "n", "suffix": "ght", "length": 5, "cluster": "93", "prob": -8.517072677612305, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "months", "id": 450, "lower": "months", "norm": "months", "shape": "xxxx", "prefix": "m", "suffix": "ths", "length": 6, "cluster": "445", "prob": -8.517988204956055, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "kids", "id": 451, "lower": "kids", "norm": "kids", "shape": "xxxx", "prefix": "k", "suffix": "ids", "length": 4, "cluster": "877", "prob": -8.520237922668457, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "players", "id": 452, "lower": "players", "norm": "players", "shape": "xxxx", "prefix": "p", "suffix": "ers", "length": 7, "cluster": "3565", "prob": -8.520515441894531, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "line", "id": 453, "lower": "line", "norm": "line", "shape": "xxxx", "prefix": "l", "suffix": "ine", "length": 4, "cluster": "3941", "prob": -8.522600173950195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "told", "id": 454, "lower": "told", "norm": "told", "shape": "xxxx", "prefix": "t", "suffix": "old", "length": 4, "cluster": "20138", "prob": -8.52303409576416, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "example", "id": 455, "lower": "example", "norm": "example", "shape": "xxxx", "prefix": "e", "suffix": "ple", "length": 7, "cluster": "497", "prob": -8.523116111755371, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "played", "id": 456, "lower": "played", "norm": "played", "shape": "xxxx", "prefix": "p", "suffix": "yed", "length": 6, "cluster": "32426", "prob": -8.528886795043945, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "reddit", "id": 457, "lower": "reddit", "norm": "reddit", "shape": "xxxx", "prefix": "r", "suffix": "dit", "length": 6, "cluster": "0", "prob": -8.52908992767334, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "based", "id": 458, "lower": "based", "norm": "based", "shape": "xxxx", "prefix": "b", "suffix": "sed", "length": 5, "cluster": "1578", "prob": -8.53032112121582, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "tried", "id": 459, "lower": "tried", "norm": "tried", "shape": "xxxx", "prefix": "t", "suffix": "ied", "length": 5, "cluster": "28586", "prob": -8.532145500183105, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "sounds", "id": 460, "lower": "sounds", "norm": "sounds", "shape": "xxxx", "prefix": "s", "suffix": "nds", "length": 6, "cluster": "2442", "prob": -8.53985595703125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "link", "id": 461, "lower": "link", "norm": "link", "shape": "xxxx", "prefix": "l", "suffix": "ink", "length": 4, "cluster": "5829", "prob": -8.540618896484375, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "girl", "id": 462, "lower": "girl", "norm": "girl", "shape": "xxxx", "prefix": "g", "suffix": "irl", "length": 4, "cluster": "549", "prob": -8.542597770690918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "open", "id": 463, "lower": "open", "norm": "open", "shape": "xxxx", "prefix": "o", "suffix": "pen", "length": 4, "cluster": "1589", "prob": -8.553583145141602, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "To", "id": 464, "lower": "to", "norm": "To", "shape": "Xx", "prefix": "T", "suffix": "To", "length": 2, "cluster": "3582", "prob": -8.557126998901367, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "taking", "id": 465, "lower": "taking", "norm": "taking", "shape": "xxxx", "prefix": "t", "suffix": "ing", "length": 6, "cluster": "31722", "prob": -8.55748462677002, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "happened", "id": 466, "lower": "happened", "norm": "happened", "shape": "xxxx", "prefix": "h", "suffix": "ned", "length": 8, "cluster": "5290", "prob": -8.559469223022461, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "during", "id": 467, "lower": "during", "norm": "during", "shape": "xxxx", "prefix": "d", "suffix": "ing", "length": 6, "cluster": "262140", "prob": -8.559581756591797, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "deal", "id": 468, "lower": "deal", "norm": "deal", "shape": "xxxx", "prefix": "d", "suffix": "eal", "length": 4, "cluster": "5829", "prob": -8.560197830200195, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "single", "id": 469, "lower": "single", "norm": "single", "shape": "xxxx", "prefix": "s", "suffix": "gle", "length": 6, "cluster": "71", "prob": -8.571329116821289, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "family", "id": 470, "lower": "family", "norm": "family", "shape": "xxxx", "prefix": "f", "suffix": "ily", "length": 6, "cluster": "1061", "prob": -8.571907043457031, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "close", "id": 471, "lower": "close", "norm": "close", "shape": "xxxx", "prefix": "c", "suffix": "ose", "length": 5, "cluster": "53", "prob": -8.581155776977539, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "happy", "id": 472, "lower": "happy", "norm": "happy", "shape": "xxxx", "prefix": "h", "suffix": "ppy", "length": 5, "cluster": "4586", "prob": -8.581560134887695, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "move", "id": 473, "lower": "move", "norm": "move", "shape": "xxxx", "prefix": "m", "suffix": "ove", "length": 4, "cluster": "7093", "prob": -8.582797050476074, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "number", "id": 474, "lower": "number", "norm": "number", "shape": "xxxx", "prefix": "n", "suffix": "ber", "length": 6, "cluster": "341", "prob": -8.584420204162598, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "US", "id": 475, "lower": "us", "norm": "US", "shape": "XX", "prefix": "U", "suffix": "US", "length": 2, "cluster": "1642", "prob": -8.585862159729004, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "water", "id": 476, "lower": "water", "norm": "water", "shape": "xxxx", "prefix": "w", "suffix": "ter", "length": 5, "cluster": "3705", "prob": -8.589462280273438, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "men", "id": 477, "lower": "men", "norm": "men", "shape": "xxx", "prefix": "m", "suffix": "men", "length": 3, "cluster": "877", "prob": -8.59007453918457, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "yeah", "id": 478, "lower": "yeah", "norm": "yeah", "shape": "xxxx", "prefix": "y", "suffix": "eah", "length": 4, "cluster": "26", "prob": -8.593489646911621, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "later", "id": 479, "lower": "later", "norm": "later", "shape": "xxxx", "prefix": "l", "suffix": "ter", "length": 5, "cluster": "5866", "prob": -8.603795051574707, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "whatever", "id": 480, "lower": "whatever", "norm": "whatever", "shape": "xxxx", "prefix": "w", "suffix": "ver", "length": 8, "cluster": "2026", "prob": -8.610091209411621, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "government", "id": 481, "lower": "government", "norm": "government", "shape": "xxxx", "prefix": "g", "suffix": "ent", "length": 10, "cluster": "297", "prob": -8.610445022583008, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "6", "id": 482, "lower": "6", "norm": "6", "shape": "d", "prefix": "6", "suffix": "6", "length": 1, "cluster": "50", "prob": -8.611133575439453, "is_alpha": false, "is_ascii": true, "is_digit": true, "is_lower": false, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "house", "id": 483, "lower": "house", "norm": "house", "shape": "xxxx", "prefix": "h", "suffix": "use", "length": 5, "cluster": "37", "prob": -8.613367080688477, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "similar", "id": 484, "lower": "similar", "norm": "similar", "shape": "xxxx", "prefix": "s", "suffix": "lar", "length": 7, "cluster": "295", "prob": -8.613471031188965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "wait", "id": 485, "lower": "wait", "norm": "wait", "shape": "xxxx", "prefix": "w", "suffix": "ait", "length": 4, "cluster": "3765", "prob": -8.613734245300293, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "questions", "id": 486, "lower": "questions", "norm": "questions", "shape": "xxxx", "prefix": "q", "suffix": "ons", "length": 9, "cluster": "1165", "prob": -8.613752365112305, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "sex", "id": 487, "lower": "sex", "norm": "sex", "shape": "xxx", "prefix": "s", "suffix": "sex", "length": 3, "cluster": "633", "prob": -8.613862991333008, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "especially", "id": 488, "lower": "especially", "norm": "especially", "shape": "xxxx", "prefix": "e", "suffix": "lly", "length": 10, "cluster": "27882", "prob": -8.616527557373047, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "lol", "id": 489, "lower": "lol", "norm": "lol", "shape": "xxx", "prefix": "l", "suffix": "lol", "length": 3, "cluster": "0", "prob": -8.621257781982422, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "Because", "id": 490, "lower": "because", "norm": "Because", "shape": "Xxxxx", "prefix": "B", "suffix": "use", "length": 7, "cluster": "1214", "prob": -8.623008728027344, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "God", "id": 491, "lower": "god", "norm": "God", "shape": "Xxx", "prefix": "G", "suffix": "God", "length": 3, "cluster": "422", "prob": -8.62376594543457, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
diff --git a/website/api/_annotation/_training.jade b/website/api/_annotation/_training.jade
index 9a5e96628..4e37ee2b1 100644
--- a/website/api/_annotation/_training.jade
+++ b/website/api/_annotation/_training.jade
@@ -98,7 +98,7 @@ p
     }
 
 p
-    |  Here's an example of the 500 most frequent lexemes in the English
+    |  Here's an example of the 100 most frequent lexemes in the English
     |  training data:
 
-+github("spacy", "examples/training/vocab-data.json", false, false, "json")
++github("spacy", "examples/training/vocab-data.jsonl", false, false, "json")

From 4112a991ec012b175a1a97add51ce04d09351886 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 30 Oct 2017 19:44:40 +0100
Subject: [PATCH 34/90] Fix vector pruning

---
 spacy/vectors.pyx | 26 ++++++++++++++------------
 spacy/vocab.pyx   | 17 +++++++++++++----
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 368b73866..552a6bcf3 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -30,7 +30,8 @@ cdef class Vectors:
     cdef readonly StringStore strings
     cdef public object key2row
     cdef public object keys
-    cdef public int i
+    cdef public int _i_key
+    cdef public int _i_vec
 
     def __init__(self, strings, width=0, data=None):
         """Create a new vector store. To keep the vector table empty, pass
@@ -53,7 +54,8 @@ cdef class Vectors:
             self.data = numpy.asarray(data, dtype='f')
         else:
             self.data = numpy.zeros((len(self.strings), width), dtype='f')
-        self.i = 0
+        self._i_key = 0
+        self._i_vec = 0
         self.key2row = {}
         self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
         if data is not None:
@@ -105,7 +107,7 @@ cdef class Vectors:
 
         RETURNS (int): The number of vectors in the data.
         """
-        return self.i
+        return self._i_vec
 
     def __contains__(self, key):
         """Check whether a key has a vector entry in the table.
@@ -127,20 +129,20 @@ cdef class Vectors:
         """
         if isinstance(key, basestring_):
             key = self.strings.add(key)
-        if key in self.key2row and row is None:
+        if row is None and key in self.key2row:
             row = self.key2row[key]
-        elif key in self.key2row and row is not None:
-            self.key2row[key] = row
         elif row is None:
-            row = self.i
-            self.i += 1
-        if row >= self.keys.shape[0]:
-            self.keys.resize((row*2,))
+            row = self._i_vec
+            self._i_vec += 1
+        if row >= self.data.shape[0]:
             self.data.resize((row*2, self.data.shape[1]))
-            self.keys[row] = key
+        if key not in self.key2row:
+            if self._i_key >= self.keys.shape[0]:
+                self.keys.resize((self._i_key*2,))
+                self.keys[self._i_key] = key
+                self._i_key += 1
 
         self.key2row[key] = row
-        self.keys[row] = key
         if vector is not None:
             self.data[row] = vector
         return row
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ff6c5b844..ecf1ad9d9 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -248,7 +248,7 @@ cdef class Vocab:
             width = self.vectors.data.shape[1]
         self.vectors = Vectors(self.strings, width=width)
 
-    def prune_vectors(self, nr_row, batch_size=1024):
+    def prune_vectors(self, nr_row, batch_size=8):
         """Reduce the current vector table to `nr_row` unique entries. Words
         mapped to the discarded vectors will be remapped to the closest vector
         among those remaining.
@@ -267,22 +267,31 @@ cdef class Vocab:
         xp = get_array_module(self.vectors.data)
         # Work in batches, to avoid memory problems.
         keep = self.vectors.data[:nr_row]
+        keep_keys = [key for key, row in self.vectors.key2row.items() if row < nr_row]
         toss = self.vectors.data[nr_row:]
         # Normalize the vectors, so cosine similarity is just dot product.
         # Note we can't modify the ones we're keeping in-place...
-        keep = keep / (xp.linalg.norm(keep)+1e-8)
+        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-8)
         keep = xp.ascontiguousarray(keep.T)
         neighbours = xp.zeros((toss.shape[0],), dtype='i')
+        scores = xp.zeros((toss.shape[0],), dtype='f')
         for i in range(0, toss.shape[0], batch_size):
             batch = toss[i : i+batch_size]
-            batch /= xp.linalg.norm(batch)+1e-8
-            neighbours[i:i+batch_size] = xp.dot(batch, keep).argmax(axis=1)
+            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8
+            sims = xp.dot(batch, keep)
+            matches = sims.argmax(axis=1)
+            neighbours[i:i+batch_size] = matches
+            scores[i:i+batch_size] = sims.max(axis=1)
         for lex in self:
             # If we're losing the vector for this word, map it to the nearest
             # vector we're keeping.
             if lex.rank >= nr_row:
                 lex.rank = neighbours[lex.rank-nr_row]
                 self.vectors.add(lex.orth, row=lex.rank)
+        for key in self.vectors.keys:
+            row = self.vectors.key2row[key]
+            if row >= nr_row:
+                self.vectors.key2row[key] = neighbours[row-nr_row]
         # Make copy, to encourage the original table to be garbage collected.
         self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
 

From 33af6ac69ad73b7e9245a8fa0cd6862bf569d73b Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 19:46:45 +0100
Subject: [PATCH 35/90] Use even smaller example size

100 was still too much, so try 20 instead
---
 examples/training/vocab-data.jsonl     | 80 --------------------------
 website/api/_annotation/_training.jade |  2 +-
 2 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/examples/training/vocab-data.jsonl b/examples/training/vocab-data.jsonl
index 3fdf5eede..2f129dd30 100644
--- a/examples/training/vocab-data.jsonl
+++ b/examples/training/vocab-data.jsonl
@@ -19,83 +19,3 @@
 {"orth": "\"", "id": 17, "lower": "\"", "norm": "\"", "shape": "\"", "prefix": "\"", "suffix": "\"", "length": 1, "cluster": "0", "prob": -5.02677583694458, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
 {"orth": "?", "id": 18, "lower": "?", "norm": "?", "shape": "?", "prefix": "?", "suffix": "?", "length": 1, "cluster": "0", "prob": -5.05924654006958, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
 {"orth": " ", "id": 0, "lower": " ", "norm": " ", "shape": " ", "prefix": " ", "suffix": " ", "length": 1, "cluster": "0", "prob": -5.129165172576904, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "have", "id": 19, "lower": "have", "norm": "have", "shape": "xxxx", "prefix": "h", "suffix": "ave", "length": 4, "cluster": "378", "prob": -5.156484603881836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "on", "id": 20, "lower": "on", "norm": "on", "shape": "xx", "prefix": "o", "suffix": "on", "length": 2, "cluster": "2044", "prob": -5.172736167907715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "*", "id": 21, "lower": "*", "norm": "*", "shape": "*", "prefix": "*", "suffix": "*", "length": 1, "cluster": "5098", "prob": -5.1977410316467285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": ")", "id": 22, "lower": ")", "norm": ")", "shape": ")", "prefix": ")", "suffix": ")", "length": 1, "cluster": "0", "prob": -5.197994232177734, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": true}
-{"orth": "be", "id": 23, "lower": "be", "norm": "be", "shape": "xx", "prefix": "b", "suffix": "be", "length": 2, "cluster": "458", "prob": -5.225094318389893, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "with", "id": 24, "lower": "with", "norm": "with", "shape": "xxxx", "prefix": "w", "suffix": "ith", "length": 4, "cluster": "1020", "prob": -5.243249893188477, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "do", "id": 25, "lower": "do", "norm": "do", "shape": "xx", "prefix": "d", "suffix": "do", "length": 2, "cluster": "2042", "prob": -5.246996879577637, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "was", "id": 26, "lower": "was", "norm": "was", "shape": "xxx", "prefix": "w", "suffix": "was", "length": 3, "cluster": "250", "prob": -5.252320289611816, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "are", "id": 27, "lower": "are", "norm": "are", "shape": "xxx", "prefix": "a", "suffix": "are", "length": 3, "cluster": "1530", "prob": -5.271068096160889, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "not", "id": 28, "lower": "not", "norm": "not", "shape": "xxx", "prefix": "n", "suffix": "not", "length": 3, "cluster": "1258", "prob": -5.332601070404053, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "but", "id": 29, "lower": "but", "norm": "but", "shape": "xxx", "prefix": "b", "suffix": "but", "length": 3, "cluster": "148", "prob": -5.3419694900512695, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "!", "id": 30, "lower": "!", "norm": "!", "shape": "!", "prefix": "!", "suffix": "!", "length": 1, "cluster": "0", "prob": -5.359641075134277, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "this", "id": 31, "lower": "this", "norm": "this", "shape": "xxxx", "prefix": "t", "suffix": "his", "length": 4, "cluster": "63", "prob": -5.36181640625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "[", "id": 32, "lower": "[", "norm": "[", "shape": "[", "prefix": "[", "suffix": "[", "length": 1, "cluster": "0", "prob": -5.438112258911133, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": true, "is_right_punct": false}
-{"orth": "-", "id": 33, "lower": "-", "norm": "-", "shape": "-", "prefix": "-", "suffix": "-", "length": 1, "cluster": "36", "prob": -5.468655109405518, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "my", "id": 34, "lower": "my", "norm": "my", "shape": "xx", "prefix": "m", "suffix": "my", "length": 2, "cluster": "251", "prob": -5.491642951965332, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "they", "id": 35, "lower": "they", "norm": "they", "shape": "xxxx", "prefix": "t", "suffix": "hey", "length": 4, "cluster": "90", "prob": -5.5243682861328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "as", "id": 36, "lower": "as", "norm": "as", "shape": "xx", "prefix": "a", "suffix": "as", "length": 2, "cluster": "212", "prob": -5.53448486328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "like", "id": 37, "lower": "like", "norm": "like", "shape": "xxxx", "prefix": "l", "suffix": "ike", "length": 4, "cluster": "1684", "prob": -5.610429763793945, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "just", "id": 38, "lower": "just", "norm": "just", "shape": "xxxx", "prefix": "j", "suffix": "ust", "length": 4, "cluster": "31978", "prob": -5.630868434906006, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "your", "id": 39, "lower": "your", "norm": "your", "shape": "xxxx", "prefix": "y", "suffix": "our", "length": 4, "cluster": "251", "prob": -5.650108814239502, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "or", "id": 40, "lower": "or", "norm": "or", "shape": "xx", "prefix": "o", "suffix": "or", "length": 2, "cluster": "404", "prob": -5.654984951019287, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "(", "id": 41, "lower": "(", "norm": "(", "shape": "(", "prefix": "(", "suffix": "(", "length": 1, "cluster": "0", "prob": -5.75598669052124, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": true, "is_right_punct": false}
-{"orth": "at", "id": 42, "lower": "at", "norm": "at", "shape": "xx", "prefix": "a", "suffix": "at", "length": 2, "cluster": "124", "prob": -5.763442516326904, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "if", "id": 43, "lower": "if", "norm": "if", "shape": "xx", "prefix": "i", "suffix": "if", "length": 2, "cluster": "4052", "prob": -5.763589859008789, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "would", "id": 44, "lower": "would", "norm": "would", "shape": "xxxx", "prefix": "w", "suffix": "uld", "length": 5, "cluster": "1978", "prob": -5.772674560546875, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "so", "id": 45, "lower": "so", "norm": "so", "shape": "xx", "prefix": "s", "suffix": "so", "length": 2, "cluster": "2282", "prob": -5.823773384094238, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "can", "id": 46, "lower": "can", "norm": "can", "shape": "xxx", "prefix": "c", "suffix": "can", "length": 3, "cluster": "58", "prob": -5.827763080596924, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "me", "id": 47, "lower": "me", "norm": "me", "shape": "xx", "prefix": "m", "suffix": "me", "length": 2, "cluster": "1898", "prob": -5.846089839935303, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "about", "id": 48, "lower": "about", "norm": "about", "shape": "xxxx", "prefix": "a", "suffix": "out", "length": 5, "cluster": "618", "prob": -5.906808853149414, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "he", "id": 49, "lower": "he", "norm": "he", "shape": "xx", "prefix": "h", "suffix": "he", "length": 2, "cluster": "218", "prob": -5.9319047927856445, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "It", "id": 50, "lower": "it", "norm": "It", "shape": "Xx", "prefix": "I", "suffix": "It", "length": 2, "cluster": "894", "prob": -5.93662691116333, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "all", "id": 51, "lower": "all", "norm": "all", "shape": "xxx", "prefix": "a", "suffix": "all", "length": 3, "cluster": "6122", "prob": -5.936640739440918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "The", "id": 52, "lower": "the", "norm": "The", "shape": "Xxx", "prefix": "T", "suffix": "The", "length": 3, "cluster": "30", "prob": -5.958707332611084, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "get", "id": 53, "lower": "get", "norm": "get", "shape": "xxx", "prefix": "g", "suffix": "get", "length": 3, "cluster": "2570", "prob": -5.992605686187744, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "one", "id": 54, "lower": "one", "norm": "one", "shape": "xxx", "prefix": "o", "suffix": "one", "length": 3, "cluster": "8170", "prob": -5.996385097503662, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "'m", "id": 55, "lower": "'m", "norm": "'m", "shape": "'x", "prefix": "'", "suffix": "'m", "length": 2, "cluster": "3066", "prob": -5.9999823570251465, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "out", "id": 56, "lower": "out", "norm": "out", "shape": "xxx", "prefix": "o", "suffix": "out", "length": 3, "cluster": "1386", "prob": -6.0027008056640625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "from", "id": 57, "lower": "from", "norm": "from", "shape": "xxxx", "prefix": "f", "suffix": "rom", "length": 4, "cluster": "380", "prob": -6.010132312774658, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "an", "id": 58, "lower": "an", "norm": "an", "shape": "xx", "prefix": "a", "suffix": "an", "length": 2, "cluster": "3", "prob": -6.014852046966553, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "what", "id": 59, "lower": "what", "norm": "what", "shape": "xxxx", "prefix": "w", "suffix": "hat", "length": 4, "cluster": "2026", "prob": -6.023346424102783, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "up", "id": 60, "lower": "up", "norm": "up", "shape": "xx", "prefix": "u", "suffix": "up", "length": 2, "cluster": "362", "prob": -6.028695583343506, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "]", "id": 61, "lower": "]", "norm": "]", "shape": "]", "prefix": "]", "suffix": "]", "length": 1, "cluster": "0", "prob": -6.0386552810668945, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": true}
-{"orth": "\n", "id": 0, "lower": "\n", "norm": "\n", "shape": "\n", "prefix": "\n", "suffix": "\n", "length": 1, "cluster": "0", "prob": -6.0506510734558105, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "people", "id": 62, "lower": "people", "norm": "people", "shape": "xxxx", "prefix": "p", "suffix": "ple", "length": 6, "cluster": "365", "prob": -6.0715765953063965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "more", "id": 63, "lower": "more", "norm": "more", "shape": "xxxx", "prefix": "m", "suffix": "ore", "length": 4, "cluster": "1514", "prob": -6.081598281860352, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": ":", "id": 64, "lower": ":", "norm": ":", "shape": ":", "prefix": ":", "suffix": ":", "length": 1, "cluster": "228", "prob": -6.128875732421875, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "there", "id": 65, "lower": "there", "norm": "there", "shape": "xxxx", "prefix": "t", "suffix": "ere", "length": 5, "cluster": "986", "prob": -6.135282039642334, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "deleted", "id": 66, "lower": "deleted", "norm": "deleted", "shape": "xxxx", "prefix": "d", "suffix": "ted", "length": 7, "cluster": "1706", "prob": -6.1543049812316895, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "think", "id": 67, "lower": "think", "norm": "think", "shape": "xxxx", "prefix": "t", "suffix": "ink", "length": 5, "cluster": "1674", "prob": -6.180924892425537, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "will", "id": 68, "lower": "will", "norm": "will", "shape": "xxxx", "prefix": "w", "suffix": "ill", "length": 4, "cluster": "442", "prob": -6.199834823608398, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "them", "id": 69, "lower": "them", "norm": "them", "shape": "xxxx", "prefix": "t", "suffix": "hem", "length": 4, "cluster": "5994", "prob": -6.2177276611328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "we", "id": 70, "lower": "we", "norm": "we", "shape": "xx", "prefix": "w", "suffix": "we", "length": 2, "cluster": "1626", "prob": -6.230024337768555, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "'re", "id": 71, "lower": "'re", "norm": "'re", "shape": "'xx", "prefix": "'", "suffix": "'re", "length": 3, "cluster": "7162", "prob": -6.255462646484375, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "when", "id": 72, "lower": "when", "norm": "when", "shape": "xxxx", "prefix": "w", "suffix": "hen", "length": 4, "cluster": "16340", "prob": -6.2623114585876465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "You", "id": 73, "lower": "you", "norm": "You", "shape": "Xxx", "prefix": "Y", "suffix": "You", "length": 3, "cluster": "858", "prob": -6.276494026184082, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "...", "id": 74, "lower": "...", "norm": "...", "shape": "...", "prefix": ".", "suffix": "...", "length": 3, "cluster": "966", "prob": -6.278521537780762, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "some", "id": 75, "lower": "some", "norm": "some", "shape": "xxxx", "prefix": "s", "suffix": "ome", "length": 4, "cluster": "239", "prob": -6.318882465362549, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "has", "id": 76, "lower": "has", "norm": "has", "shape": "xxx", "prefix": "h", "suffix": "has", "length": 3, "cluster": "890", "prob": -6.325605392456055, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "because", "id": 77, "lower": "because", "norm": "because", "shape": "xxxx", "prefix": "b", "suffix": "use", "length": 7, "cluster": "980", "prob": -6.349620342254639, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "know", "id": 78, "lower": "know", "norm": "know", "shape": "xxxx", "prefix": "k", "suffix": "now", "length": 4, "cluster": "3722", "prob": -6.368943214416504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "really", "id": 79, "lower": "really", "norm": "really", "shape": "xxxx", "prefix": "r", "suffix": "lly", "length": 6, "cluster": "7802", "prob": -6.370757102966309, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "by", "id": 80, "lower": "by", "norm": "by", "shape": "xx", "prefix": "b", "suffix": "by", "length": 2, "cluster": "252", "prob": -6.375086784362793, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "time", "id": 81, "lower": "time", "norm": "time", "shape": "xxxx", "prefix": "t", "suffix": "ime", "length": 4, "cluster": "477", "prob": -6.3782219886779785, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "did", "id": 82, "lower": "did", "norm": "did", "shape": "xxx", "prefix": "d", "suffix": "did", "length": 3, "cluster": "8186", "prob": -6.389003753662109, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "no", "id": 83, "lower": "no", "norm": "no", "shape": "xx", "prefix": "n", "suffix": "no", "length": 2, "cluster": "4074", "prob": -6.402691841125488, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "had", "id": 84, "lower": "had", "norm": "had", "shape": "xxx", "prefix": "h", "suffix": "had", "length": 3, "cluster": "1914", "prob": -6.45427131652832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "their", "id": 85, "lower": "their", "norm": "their", "shape": "xxxx", "prefix": "t", "suffix": "eir", "length": 5, "cluster": "187", "prob": -6.461463928222656, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "If", "id": 86, "lower": "if", "norm": "If", "shape": "Xx", "prefix": "I", "suffix": "If", "length": 2, "cluster": "190", "prob": -6.469156742095947, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "how", "id": 87, "lower": "how", "norm": "how", "shape": "xxx", "prefix": "h", "suffix": "how", "length": 3, "cluster": "10218", "prob": -6.496722221374512, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "does", "id": 88, "lower": "does", "norm": "does", "shape": "xxxx", "prefix": "d", "suffix": "oes", "length": 4, "cluster": "4090", "prob": -6.500738143920898, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "who", "id": 89, "lower": "who", "norm": "who", "shape": "xxx", "prefix": "w", "suffix": "who", "length": 3, "cluster": "410", "prob": -6.504637241363525, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "than", "id": 90, "lower": "than", "norm": "than", "shape": "xxxx", "prefix": "t", "suffix": "han", "length": 4, "cluster": "106", "prob": -6.512253761291504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "good", "id": 91, "lower": "good", "norm": "good", "shape": "xxxx", "prefix": "g", "suffix": "ood", "length": 4, "cluster": "551", "prob": -6.518923759460449, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "only", "id": 92, "lower": "only", "norm": "only", "shape": "xxxx", "prefix": "o", "suffix": "nly", "length": 4, "cluster": "15594", "prob": -6.535442352294922, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "his", "id": 93, "lower": "his", "norm": "his", "shape": "xxx", "prefix": "h", "suffix": "his", "length": 3, "cluster": "123", "prob": -6.574275016784668, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "much", "id": 94, "lower": "much", "norm": "much", "shape": "xxxx", "prefix": "m", "suffix": "uch", "length": 4, "cluster": "2794", "prob": -6.584301948547363, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": ";", "id": 95, "lower": ";", "norm": ";", "shape": ";", "prefix": ";", "suffix": ";", "length": 1, "cluster": "36", "prob": -6.586422920227051, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "'ve", "id": 96, "lower": "'ve", "norm": "'ve", "shape": "'xx", "prefix": "'", "suffix": "'ve", "length": 3, "cluster": "1018", "prob": -6.593011379241943, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
-{"orth": "could", "id": 97, "lower": "could", "norm": "could", "shape": "xxxx", "prefix": "c", "suffix": "uld", "length": 5, "cluster": "954", "prob": -6.595959186553955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
diff --git a/website/api/_annotation/_training.jade b/website/api/_annotation/_training.jade
index 4e37ee2b1..9bd59cdae 100644
--- a/website/api/_annotation/_training.jade
+++ b/website/api/_annotation/_training.jade
@@ -98,7 +98,7 @@ p
     }
 
 p
-    |  Here's an example of the 100 most frequent lexemes in the English
+    |  Here's an example of the 20 most frequent lexemes in the English
     |  training data:
 
 +github("spacy", "examples/training/vocab-data.jsonl", false, false, "json")

From 8ad4f3f6e506a7c93f2e0dc821e235262bd5cda5 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 19:48:35 +0100
Subject: [PATCH 36/90] Take out JSON format include in tagger/parser

---
 website/usage/_training/_tagger-parser.jade | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade
index f2fa4bab5..646f9ecb0 100644
--- a/website/usage/_training/_tagger-parser.jade
+++ b/website/usage/_training/_tagger-parser.jade
@@ -190,7 +190,3 @@ p
 
     +item
         |  #[strong Test] the model to make sure the parser works as expected.
-
-+h(3, "training-json") JSON format for training
-
-include ../../api/_annotation/_training

From 5af6c8b746c26f92e4859c01b65296edda2f9e7f Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 20:28:00 +0100
Subject: [PATCH 37/90] Update training docs

---
 website/_includes/_svg.jade          |  3 +++
 website/usage/_training/_basics.jade | 23 +++++++++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/website/_includes/_svg.jade b/website/_includes/_svg.jade
index 0f7266c0a..54e0667a3 100644
--- a/website/_includes/_svg.jade
+++ b/website/_includes/_svg.jade
@@ -62,6 +62,9 @@ svg(style="position: absolute; visibility: hidden; width: 0; height: 0;" width="
         symbol#svg_explosion(viewBox="0 0 500 500")
             path(fill="currentColor" d="M111.7 74.9L91.2 93.1l9.1 10.2 17.8-15.8 7.4 8.4-17.8 15.8 10.1 11.4 20.6-18.2 7.7 8.7-30.4 26.9-41.9-47.3 30.3-26.9 7.6 8.6zM190.8 59.6L219 84.3l-14.4 4.5-20.4-18.2-6.4 26.6-14.4 4.5 8.9-36.4-26.9-24.1 14.3-4.5L179 54.2l5.7-25.2 14.3-4.5-8.2 35.1zM250.1 21.2l27.1 3.4c6.1.8 10.8 3.1 14 7.2 3.2 4.1 4.5 9.2 3.7 15.5-.8 6.3-3.2 11-7.4 14.1-4.1 3.1-9.2 4.3-15.3 3.5L258 63.2l-2.8 22.3-13-1.6 7.9-62.7zm11.5 13l-2.2 17.5 12.6 1.6c5.1.6 9.1-2 9.8-7.6.7-5.6-2.5-9.2-7.6-9.9l-12.6-1.6zM329.1 95.4l23.8 13.8-5.8 10L312 98.8l31.8-54.6 11.3 6.6-26 44.6zM440.5 145c-1.3 8.4-5.9 15.4-13.9 21.1s-16.2 7.7-24.6 6.1c-8.4-1.6-15.3-6.3-20.8-14.1-5.5-7.9-7.6-16-6.4-24.4 1.3-8.5 6-15.5 14-21.1 8-5.6 16.2-7.7 24.5-6 8.4 1.6 15.4 6.3 20.9 14.2 5.5 7.6 7.6 15.7 6.3 24.2zM412 119c-5.1-.8-10.3.6-15.6 4.4-5.2 3.7-8.4 8.1-9.4 13.2-1 5.2.2 10.1 3.5 14.8 3.4 4.8 7.5 7.5 12.7 8.2 5.2.8 10.4-.7 15.6-4.4 5.3-3.7 8.4-8.1 9.4-13.2 1.1-5.1-.1-9.9-3.4-14.7-3.4-4.8-7.6-7.6-12.8-8.3zM471.5 237.9c-2.8 4.8-7.1 7.6-13 8.7l-2.6-13.1c5.3-.9 8.1-5 7.2-11-.9-5.8-4.3-8.8-8.9-8.2-2.3.3-3.7 1.4-4.5 3.3-.7 1.9-1.4 5.2-1.7 10.1-.8 7.5-2.2 13.1-4.3 16.9-2.1 3.9-5.7 6.2-10.9 7-6.3.9-11.3-.5-15.2-4.4-3.9-3.8-6.3-9-7.3-15.7-1.1-7.4-.2-13.7 2.6-18.8 2.8-5.1 7.4-8.2 13.7-9.2l2.6 13c-5.6 1.1-8.7 6.6-7.7 13.4 1 6.6 3.9 9.5 8.6 8.8 4.4-.7 5.7-4.5 6.7-14.1.3-3.5.7-6.2 1.1-8.4.4-2.2 1.2-4.4 2.2-6.8 2.1-4.7 6-7.2 11.8-8.1 5.4-.8 10.3.4 14.5 3.7 4.2 3.3 6.9 8.5 8 15.6.9 6.9-.1 12.6-2.9 17.3zM408.6 293.5l2.4-12.9 62 11.7-2.4 12.9-62-11.7zM419.6 396.9c-8.3 2-16.5.3-24.8-5-8.2-5.3-13.2-12.1-14.9-20.5-1.6-8.4.1-16.6 5.3-24.6 5.2-8.1 11.9-13.1 20.2-15.1 8.4-1.9 16.6-.3 24.9 5 8.2 5.3 13.2 12.1 14.8 20.5 1.7 8.4 0 16.6-5.2 24.7-5.2 8-12 13-20.3 15zm13.4-36.3c-1.2-5.1-4.5-9.3-9.9-12.8s-10.6-4.7-15.8-3.7-9.3 4-12.4 8.9-4.1 9.8-2.8 14.8c1.2 5.1 4.5 9.3 9.9 12.8 5.5 3.5 10.7 4.8 15.8 3.7 5.1-.9 9.2-3.8 12.3-8.7s4.1-9.9 2.9-15zM303.6 416.5l9.6-5.4 43.3 20.4-19.2-34 11.4-6.4 31 55-9.6 5.4-43.4-20.5 19.2 
34.1-11.3 6.4-31-55zM238.2 468.8c-49 0-96.9-17.4-134.8-49-38.3-32-64-76.7-72.5-125.9-2-11.9-3.1-24-3.1-35.9 0-36.5 9.6-72.6 27.9-104.4 2.1-3.6 6.7-4.9 10.3-2.8 3.6 2.1 4.9 6.7 2.8 10.3-16.9 29.5-25.9 63.1-25.9 96.9 0 11.1 1 22.3 2.9 33.4 7.9 45.7 31.8 87.2 67.3 116.9 35.2 29.3 79.6 45.5 125.1 45.5 11.1 0 22.3-1 33.4-2.9 4.1-.7 8 2 8.7 6.1.7 4.1-2 8-6.1 8.7-11.9 2-24 3.1-36 3.1z")
 
+        symbol#svg_prodigy(viewBox="0 0 538.5 157.6")
+            path(fill="currentColor" d="M70.6 48.6c7 7.3 10.5 17.1 10.5 29.2S77.7 99.7 70.6 107c-6.9 7.3-15.9 11.1-27 11.1-9.4 0-16.8-2.7-21.7-8.2v44.8H0V39h20.7v8.1c4.8-6.4 12.4-9.6 22.9-9.6 11.1 0 20.1 3.7 27 11.1zM21.9 76v3.6c0 12.1 7.3 19.8 18.3 19.8 11.2 0 18.7-7.9 18.7-21.6s-7.5-21.6-18.7-21.6c-11 0-18.3 7.7-18.3 19.8zM133.8 59.4c-12.6 0-20.5 7-20.5 17.8v39.3h-22V39h21.1v8.8c4-6.4 11.2-9.6 21.3-9.6v21.2zM209.5 107.1c-7.6 7.3-17.5 11.1-29.5 11.1s-21.9-3.8-29.7-11.1c-7.6-7.5-11.5-17.2-11.5-29.2 0-12.1 3.9-21.9 11.5-29.2 7.8-7.3 17.7-11.1 29.7-11.1s21.9 3.8 29.5 11.1c7.8 7.3 11.7 17.1 11.7 29.2 0 11.9-3.9 21.7-11.7 29.2zM180 56.2c-5.7 0-10.3 1.9-13.8 5.8-3.5 3.8-5.2 9-5.2 15.7 0 6.7 1.8 12 5.2 15.7 3.4 3.8 8.1 5.7 13.8 5.7s10.3-1.9 13.8-5.7 5.2-9 5.2-15.7c0-6.8-1.8-12-5.2-15.7-3.5-3.8-8.1-5.8-13.8-5.8zM313 116.5h-20.5v-7.9c-4.4 5.5-12.7 9.6-23.1 9.6-10.9 0-19.9-3.8-27-11.1C235.5 99.7 232 90 232 77.8s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 9.7 0 17.1 2.7 21.9 8.2V0H313v116.5zm-58.8-38.7c0 13.6 7.5 21.4 18.7 21.4 10.9 0 18.3-7.3 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM354.1 13.6c0 3.6-1.3 6.8-3.9 9.3-5 4.9-13.6 4.9-18.6 0-8.4-7.5-1.6-23.1 9.3-22.5 7.4 0 13.2 5.9 13.2 13.2zm-2.2 102.9H330V39h21.9v77.5zM425.1 47.1V39h20.5v80.4c0 11.2-3.6 20.1-10.6 26.8-7 6.7-16.6 10-28.5 10-23.4 0-36.9-11.4-39.9-29.8l21.7-.8c1 7.6 7.6 12 17.4 12 11.2 0 18.1-5.8 18.1-16.6v-11.1c-5.1 5.5-12.5 8.2-21.9 8.2-10.9 0-19.9-3.8-27-11.1-6.9-7.3-10.3-17.1-10.3-29.2s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 10.7 0 18.4 3.1 23.2 9.6zm-38.3 30.7c0 13.6 7.5 21.6 18.7 21.6 11 0 18.3-7.6 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM488.8 154.8H465l19.8-45.1L454.5 39h24.1l17.8 46.2L514.2 39h24.3l-49.7 115.8z")
+
 
         //- Machine learning & NLP libraries
 
diff --git a/website/usage/_training/_basics.jade b/website/usage/_training/_basics.jade
index 77df3c433..d20648416 100644
--- a/website/usage/_training/_basics.jade
+++ b/website/usage/_training/_basics.jade
@@ -76,6 +76,16 @@ p
         ("Google rebrands its business apps", [(0, 6, "ORG")]),
         ("look what i found on google! 😂", [(21, 27, "PRODUCT")])]
 
++infobox("Tip: Try the Prodigy annotation tool")
+    +infobox-logos(["prodigy", 100, 29, "https://prodi.gy"])
+    |  If you need to label a lot of data, check out
+    |  #[+a("https://prodi.gy", true) Prodigy], a new, active learning-powered
+    |  annotation tool we've developed. Prodigy is fast and extensible, and
+    |  comes with a modern #[strong web application] that helps you collect
+    |  training data faster. It integrates seamlessly with spaCy, pre-selects
+    |  the #[strong most relevant examples] for annotation, and lets you
+    |  train and evaluate ready-to-use spaCy models.
+
 +h(3, "annotations") Training with annotations
 
 p
@@ -180,9 +190,10 @@ p
         +cell #[code optimizer]
         +cell Callable to update the model's weights.
 
-+infobox
-    |  For the #[strong full example and more details], see the usage guide on
-    |  #[+a("/usage/training#ner") training the named entity recognizer],
-    |  or the runnable
-    |  #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
-    |  on GitHub.
+p
+    |  Instead of writing your own training loop, you can also use the
+    |  built-in #[+api("cli#train") #[code train]] command, which expects data
+    |  in spaCy's #[+a("/api/annotation#json-input") JSON format]. On each epoch,
+    |  a model will be saved out to the directory. After training, you can
+    |  use the #[+api("cli#package") #[code package]] command to generate an
+    |  installable Python package from your model.

From 368fdb389ad23d07b99604483a8f96ff5a11e1d0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 02:00:26 +0100
Subject: [PATCH 38/90] WIP on refactoring and fixing vectors

---
 spacy/_ml.py                          | 16 ++++--
 spacy/cli/train.py                    |  5 ++
 spacy/tests/vocab/test_add_vectors.py | 44 ++++++++++++---
 spacy/vectors.pyx                     | 57 ++++++++-----------
 spacy/vocab.pyx                       | 81 ++++++++++++++++++++++++---
 5 files changed, 147 insertions(+), 56 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index c99f840b7..e9dac11df 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -29,6 +29,12 @@ from . import util
 VECTORS_KEY = 'spacy_pretrained_vectors'
 
 
+def cosine(vec1, vec2):
+    norm1 = (vec1**2).sum() ** 0.5
+    norm2 = (vec2**2).sum() ** 0.5
+    return vec1.dot(vec2) / (norm1 * norm2)
+
+
 @layerize
 def _flatten_add_lengths(seqs, pad=0, drop=0.):
     ops = Model.ops
@@ -198,11 +204,11 @@ class PrecomputableAffine(Model):
 def link_vectors_to_models(vocab):
     vectors = vocab.vectors
     ops = Model.ops
-    for word in vocab:
-        if word.orth in vectors.key2row:
-            word.rank = vectors.key2row[word.orth]
-        else:
-            word.rank = 0
+    #for word in vocab:
+    #    if word.orth in vectors.key2row:
+    #        word.rank = vectors.key2row[word.orth]
+    #    else:
+    #        word.rank = 0
     data = ops.asarray(vectors.data)
     # Set an entry here, so that vectors are accessed by StaticVectors
     # (unideal, I know)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index fb96e6c05..2300c3b94 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -94,6 +94,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+        if vectors_limit is not None:
+            remap = nlp.vocab.prune_vectors(vectors_limit)
+            print('remap', len(remap))
+            for key, (value, sim) in remap.items():
+                print(repr(key), repr(value), sim)
     for name in pipeline:
         nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
diff --git a/spacy/tests/vocab/test_add_vectors.py b/spacy/tests/vocab/test_add_vectors.py
index 10477cdf1..0ce95e5e9 100644
--- a/spacy/tests/vocab/test_add_vectors.py
+++ b/spacy/tests/vocab/test_add_vectors.py
@@ -3,13 +3,41 @@ from __future__ import unicode_literals
 
 import numpy
 import pytest
+from ...vocab import Vocab
+from ..._ml import cosine
 
 
-@pytest.mark.xfail
-@pytest.mark.parametrize('text', ["Hello"])
-def test_vocab_add_vector(en_vocab, text):
-    en_vocab.resize_vectors(10)
-    lex = en_vocab[text]
-    lex.vector = numpy.ndarray((10,), dtype='float32')
-    lex = en_vocab[text]
-    assert lex.vector.shape == (10,)
+def test_vocab_add_vector():
+    vocab = Vocab()
+    data = numpy.ndarray((5,3), dtype='f')
+    data[0] = 1.
+    data[1] = 2.
+    vocab.set_vector(u'cat', data[0])
+    vocab.set_vector(u'dog', data[1])
+    cat = vocab[u'cat']
+    assert list(cat.vector) == [1., 1., 1.]
+    dog = vocab[u'dog']
+    assert list(dog.vector) == [2., 2., 2.]
+    for lex in vocab:
+        print(lex.orth_)
+
+
+def test_vocab_prune_vectors():
+    vocab = Vocab()
+    _ = vocab[u'cat']
+    _ = vocab[u'dog']
+    _ = vocab[u'kitten']
+    print(list(vocab.strings))
+    data = numpy.ndarray((5,3), dtype='f')
+    data[0] = 1.
+    data[1] = 2.
+    data[2] = 1.1
+    vocab.set_vector(u'cat', data[0])
+    vocab.set_vector(u'dog', data[1])
+    vocab.set_vector(u'kitten', data[2])
+    for lex in vocab:
+        print(lex.orth_)
+
+    remap = vocab.prune_vectors(2)
+    assert remap == {u'kitten': (u'cat', cosine(data[0], data[2]))}
+    #print(remap)
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 155d7b9d2..6a1bc876e 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -27,8 +27,7 @@ cdef class Vectors:
     cdef public object data
     cdef readonly StringStore strings
     cdef public object key2row
-    cdef public object keys
-    cdef public int i
+    cdef public int _i_vec
 
     def __init__(self, strings, width=0, data=None):
         """Create a new vector store. To keep the vector table empty, pass
@@ -51,13 +50,13 @@ cdef class Vectors:
             self.data = numpy.asarray(data, dtype='f')
         else:
             self.data = numpy.zeros((len(self.strings), width), dtype='f')
-        self.i = 0
+        self._i_vec = 0
         self.key2row = {}
-        self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
-        for i, string in enumerate(self.strings):
-            if i >= self.data.shape[0]:
-                break
-            self.add(self.strings[string], self.data[i])
+        if data is not None:
+            for i, string in enumerate(self.strings):
+                if i >= self.data.shape[0]:
+                    break
+                self.add(self.strings[string], vector=self.data[i])
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))
@@ -122,16 +121,15 @@ cdef class Vectors:
         """
         if isinstance(key, basestring_):
             key = self.strings.add(key)
-        if key not in self.key2row:
-            i = self.i
-            if i >= self.keys.shape[0]:
-                self.keys.resize((self.keys.shape[0]*2,))
-                self.data.resize((self.data.shape[0]*2, self.data.shape[1]))
-            self.key2row[key] = self.i
-            self.keys[self.i] = key
-            self.i += 1
-        else:
-            i = self.key2row[key]
+        if row is None and key in self.key2row:
+            row = self.key2row[key]
+        elif row is None:
+            row = self._i_vec
+            self._i_vec += 1
+        if row >= self.data.shape[0]:
+            self.data.resize((row*2, self.data.shape[1]))
+
+        self.key2row[key] = row
         if vector is not None:
             self.data[i] = vector
         return i
@@ -141,9 +139,9 @@ cdef class Vectors:
 
         YIELDS (tuple): A key/vector pair.
         """
-        for i, key in enumerate(self.keys):
+        for key, row in self.key2row.items():
             string = self.strings[key]
-            yield string, self.data[i]
+            yield string, self.data[row]
 
     @property
     def shape(self):
@@ -202,7 +200,7 @@ cdef class Vectors:
             save_array = lambda arr, file_: xp.save(file_, arr)
         serializers = OrderedDict((
             ('vectors', lambda p: save_array(self.data, p.open('wb'))),
-            ('keys', lambda p: xp.save(p.open('wb'), self.keys))
+            ('key2row', lambda p: msgpack.dump(self.key2row, p.open('wb')))
         ))
         return util.to_disk(path, serializers, exclude)
 
@@ -215,10 +213,7 @@ cdef class Vectors:
         """
         def load_keys(path):
             if path.exists():
-                self.keys = numpy.load(path2str(path))
-                for i, key in enumerate(self.keys):
-                    self.keys[i] = key
-                    self.key2row[key] = i
+                self.key2row = msgpack.load(path.open('rb'))
 
         def load_vectors(path):
             xp = Model.ops.xp
@@ -226,7 +221,7 @@ cdef class Vectors:
                 self.data = xp.load(path)
 
         serializers = OrderedDict((
-            ('keys', load_keys),
+            ('key2row', load_keys),
             ('vectors', load_vectors),
         ))
         util.from_disk(path, serializers, exclude)
@@ -244,7 +239,7 @@ cdef class Vectors:
             else:
                 return msgpack.dumps(self.data)
         serializers = OrderedDict((
-            ('keys', lambda: msgpack.dumps(self.keys)),
+            ('key2row', lambda: msgpack.dumps(self.key2row)),
             ('vectors', serialize_weights)
         ))
         return util.to_bytes(serializers, exclude)
@@ -262,14 +257,8 @@ cdef class Vectors:
             else:
                 self.data = msgpack.loads(b)
 
-        def load_keys(keys):
-            self.keys.resize((len(keys),))
-            for i, key in enumerate(keys):
-                self.keys[i] = key
-                self.key2row[key] = i
-
         deserializers = OrderedDict((
-            ('keys', lambda b: load_keys(msgpack.loads(b))),
+            ('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
             ('vectors', deserialize_weights)
         ))
         util.from_bytes(data, deserializers, exclude)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 8b09d7ee7..e3cad12e0 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -190,10 +190,11 @@ cdef class Vocab:
 
         YIELDS (Lexeme): An entry in the vocabulary.
         """
-        cdef attr_t orth
+        cdef attr_t key
         cdef size_t addr
-        for orth, addr in self._by_orth.items():
-            yield Lexeme(self, orth)
+        for key, addr in self._by_orth.items():
+            lex = Lexeme(self, key)
+            yield lex
 
     def __getitem__(self, id_or_string):
         """Retrieve a lexeme, given an int ID or a unicode string. If a
@@ -211,7 +212,7 @@ cdef class Vocab:
             >>> assert nlp.vocab[apple] == nlp.vocab[u'apple']
         """
         cdef attr_t orth
-        if type(id_or_string) == unicode:
+        if isinstance(id_or_string, unicode):
             orth = self.strings.add(id_or_string)
         else:
             orth = id_or_string
@@ -242,9 +243,69 @@ cdef class Vocab:
         """Drop the current vector table. Because all vectors must be the same
         width, you have to call this to change the size of the vectors.
         """
-        if new_dim is None:
-            new_dim = self.vectors.data.shape[1]
-        self.vectors = Vectors(self.strings, width=new_dim)
+        if width is None:
+            width = self.vectors.data.shape[1]
+        self.vectors = Vectors(self.strings, width=width)
+
+    def prune_vectors(self, nr_row, batch_size=8):
+        """Reduce the current vector table to `nr_row` unique entries. Words
+        mapped to the discarded vectors will be remapped to the closest vector
+        among those remaining.
+
+        For example, suppose the original table had vectors for the words:
+        ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to,
+        two rows, we would discard the vectors for 'feline' and 'reclined'.
+        These words would then be remapped to the closest remaining vector
+        -- so "feline" would have the same vector as "cat", and "reclined"
+        would have the same vector as "sat".
+
+        The similarities are judged by cosine. The original vectors may
+        be large, so the cosines are calculated in minibatches, to reduce
+        memory usage.
+
+        nr_row (int): The number of rows to keep in the vector table.
+        batch_size (int): Batch of vectors for calculating the similarities.
+            Larger batch sizes might be faster, while temporarily requiring
+            more memory.
+        RETURNS (dict): A dictionary keyed by removed words mapped to
+            `(string, score)` tuples, where `string` is the entry the removed
+            word was mapped to, and `score` the similarity score between the
+            two words.
+        """
+        xp = get_array_module(self.vectors.data)
+        # Work in batches, to avoid memory problems.
+        keep = self.vectors.data[:nr_row]
+        keep_keys = [key for key, row in self.vectors.key2row.items() if row < nr_row]
+        toss = self.vectors.data[nr_row:]
+        # Normalize the vectors, so cosine similarity is just dot product.
+        # Note we can't modify the ones we're keeping in-place...
+        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-8)
+        keep = xp.ascontiguousarray(keep.T)
+        neighbours = xp.zeros((toss.shape[0],), dtype='i')
+        scores = xp.zeros((toss.shape[0],), dtype='f')
+        for i in range(0, toss.shape[0], batch_size):
+            batch = toss[i : i+batch_size]
+            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8
+            sims = xp.dot(batch, keep)
+            matches = sims.argmax(axis=1)
+            neighbours[i:i+batch_size] = matches
+            scores[i:i+batch_size] = sims.max(axis=1)
+        i2k = {i: key for key, i in self.vectors.key2row.items()}
+        remap = {}
+        for lex in list(self):
+            # If we're losing the vector for this word, map it to the nearest
+            # vector we're keeping.
+            if lex.rank >= nr_row:
+                lex.rank = neighbours[lex.rank-nr_row]
+                self.vectors.add(lex.orth, row=lex.rank)
+                remap[lex.orth_] = (i2k[lex.rank], scores[lex.rank])
+        for key, row in self.vectors.key2row.items():
+            if row >= nr_row:
+                self.vectors.key2row[key] = neighbours[row-nr_row]
+        # Make copy, to encourage the original table to be garbage collected.
+        self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
+        link_vectors_to_models(self)
+        return remap
 
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary. Words can be looked
@@ -266,9 +327,11 @@ cdef class Vocab:
         """Set a vector for a word in the vocabulary. Words can be referenced
         by string or int ID.
         """
-        if not isinstance(orth, basestring_):
-            orth = self.strings[orth]
+        if self.vectors.data.size == 0:
+            self.clear_vectors(vector.shape[0])
+        lex = self[orth]
         self.vectors.add(orth, vector=vector)
+        lex.rank = self.vectors.key2row[lex.orth]
 
     def has_vector(self, orth):
         """Check whether a word has a vector. Returns False if no vectors have

From cb5217012f46b38f2768ed7b68680bdc33625f40 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 11:40:46 +0100
Subject: [PATCH 39/90] Fix vector remapping

---
 spacy/_ml.py                          | 10 +++++++---
 spacy/tests/vocab/test_add_vectors.py | 13 +++++--------
 spacy/vocab.pyx                       | 10 ++++++----
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index e9dac11df..fa8e83d48 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -30,9 +30,13 @@ VECTORS_KEY = 'spacy_pretrained_vectors'
 
 
 def cosine(vec1, vec2):
-    norm1 = (vec1**2).sum() ** 0.5
-    norm2 = (vec2**2).sum() ** 0.5
-    return vec1.dot(vec2) / (norm1 * norm2)
+    xp = get_array_module(vec1)
+    norm1 = xp.linalg.norm(vec1)
+    norm2 = xp.linalg.norm(vec2)
+    if norm1 == 0. or norm2 == 0.:
+        return 0
+    else:
+        return vec1.dot(vec2) / (norm1 * norm2)
 
 
 @layerize
diff --git a/spacy/tests/vocab/test_add_vectors.py b/spacy/tests/vocab/test_add_vectors.py
index 0ce95e5e9..3cb0b632c 100644
--- a/spacy/tests/vocab/test_add_vectors.py
+++ b/spacy/tests/vocab/test_add_vectors.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 import numpy
-import pytest
+from numpy.testing import assert_allclose
 from ...vocab import Vocab
 from ..._ml import cosine
 
@@ -18,8 +18,6 @@ def test_vocab_add_vector():
     assert list(cat.vector) == [1., 1., 1.]
     dog = vocab[u'dog']
     assert list(dog.vector) == [2., 2., 2.]
-    for lex in vocab:
-        print(lex.orth_)
 
 
 def test_vocab_prune_vectors():
@@ -27,7 +25,6 @@ def test_vocab_prune_vectors():
     _ = vocab[u'cat']
     _ = vocab[u'dog']
     _ = vocab[u'kitten']
-    print(list(vocab.strings))
     data = numpy.ndarray((5,3), dtype='f')
     data[0] = 1.
     data[1] = 2.
@@ -35,9 +32,9 @@ def test_vocab_prune_vectors():
     vocab.set_vector(u'cat', data[0])
     vocab.set_vector(u'dog', data[1])
     vocab.set_vector(u'kitten', data[2])
-    for lex in vocab:
-        print(lex.orth_)
 
     remap = vocab.prune_vectors(2)
-    assert remap == {u'kitten': (u'cat', cosine(data[0], data[2]))}
-    #print(remap)
+    assert list(remap.keys()) == [u'kitten']
+    neighbour, similarity = remap.values()[0]
+    assert neighbour == u'cat'
+    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 6143986fb..cfc81bbe9 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -281,24 +281,26 @@ cdef class Vocab:
         toss = self.vectors.data[nr_row:]
         # Normalize the vectors, so cosine similarity is just dot product.
         # Note we can't modify the ones we're keeping in-place...
-        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-8)
+        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-12)
         keep = xp.ascontiguousarray(keep.T)
         neighbours = xp.zeros((toss.shape[0],), dtype='i')
         scores = xp.zeros((toss.shape[0],), dtype='f')
         for i in range(0, toss.shape[0], batch_size):
             batch = toss[i : i+batch_size]
-            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8
+            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-12
             sims = xp.dot(batch, keep)
             matches = sims.argmax(axis=1)
             neighbours[i:i+batch_size] = matches
             scores[i:i+batch_size] = sims.max(axis=1)
-        for lex in self:
+        i2k = {i: key for key, i in self.vectors.key2row.items()}
+        remap = {}
+        for lex in list(self):
             # If we're losing the vector for this word, map it to the nearest
             # vector we're keeping.
             if lex.rank >= nr_row:
                 lex.rank = neighbours[lex.rank-nr_row]
                 self.vectors.add(lex.orth, row=lex.rank)
-                remap[lex.orth_] = (i2k[lex.rank], scores[lex.rank])
+                remap[lex.orth_] = (self.strings[i2k[lex.rank]], scores[lex.rank])
         for key, row in self.vectors.key2row.items():
             if row >= nr_row:
                 self.vectors.key2row[key] = neighbours[row-nr_row]

From be5b635388cfce277e31afccfad03ac4cb4bf687 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Tue, 31 Oct 2017 13:37:55 +0100
Subject: [PATCH 40/90] Remove "needs model" and add info about models (see
 #1471)

---
 website/usage/spacy-101.jade | 59 ++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/website/usage/spacy-101.jade b/website/usage/spacy-101.jade
index 8a2741e71..a9fd97508 100644
--- a/website/usage/spacy-101.jade
+++ b/website/usage/spacy-101.jade
@@ -88,80 +88,94 @@ p
         |  while others are related to more general machine learning
         |  functionality.
 
-    +aside
-        |  If one of spaCy's functionalities #[strong needs a model], it means
-        |  that you need to have one of the available
-        |  #[+a("/models") statistical models] installed. Models are used
-        |  to #[strong predict] linguistic annotations – for example, if a word
-        |  is a verb or a noun.
-
-    +table(["Name", "Description", "Needs model"])
+    +table(["Name", "Description"])
         +row
             +cell #[strong Tokenization]
             +cell Segmenting text into words, punctuations marks etc.
-            +cell #[+procon("no", "no", true)]
 
         +row
             +cell #[strong Part-of-speech] (POS) #[strong Tagging]
             +cell Assigning word types to tokens, like verb or noun.
-            +cell #[+procon("yes", "yes", true)]
 
         +row
             +cell #[strong Dependency Parsing]
             +cell
                 |  Assigning syntactic dependency labels, describing the
                 |  relations between individual tokens, like subject or object.
-            +cell #[+procon("yes", "yes", true)]
 
         +row
             +cell #[strong Lemmatization]
             +cell
                 |  Assigning the base forms of words. For example, the lemma of
                 |  "was" is "be", and the lemma of "rats" is "rat".
-            +cell #[+procon("no", "no", true)]
 
         +row
             +cell #[strong Sentence Boundary Detection] (SBD)
             +cell Finding and segmenting individual sentences.
-            +cell #[+procon("yes", "yes", true)]
 
         +row
             +cell #[strong Named Entity Recongition] (NER)
             +cell
                 |  Labelling named "real-world" objects, like persons, companies
                 |  or locations.
-            +cell #[+procon("yes", "yes", true)]
 
         +row
             +cell #[strong Similarity]
             +cell
                 |  Comparing words, text spans and documents and how similar
                 |  they are to each other.
-            +cell #[+procon("yes", "yes", true)]
 
         +row
             +cell #[strong Text Classification]
             +cell
                 |  Assigning categories or labels to a whole document, or parts
                 |  of a document.
-            +cell #[+procon("yes", "yes", true)]
 
         +row
             +cell #[strong Rule-based Matching]
             +cell
                 |  Finding sequences of tokens based on their texts and
                 |  linguistic annotations, similar to regular expressions.
-            +cell #[+procon("no", "no", true)]
 
         +row
             +cell #[strong Training]
             +cell Updating and improving a statistical model's predictions.
-            +cell #[+procon("no", "no", true)]
 
         +row
             +cell #[strong Serialization]
             +cell Saving objects to files or byte strings.
-            +cell #[+procon("no", "no", true)]
+
+    +h(3, "statistical-models") Statistical models
+
+    p
+        |  While some of spaCy's features work independently, others require
+        |  #[+a("/models") statistical models] to be loaded, which enable spaCy
+        |  to #[strong predict] linguistic annotations – for example,
+        |  whether a word is a verb or a noun. spaCy currently offers statistical
+        |  models for #[strong #{MODEL_LANG_COUNT} languages], which can be
+        |  installed as individual Python modules. Models can differ in size,
+        |  speed, memory usage, accuracy and the data they include. The model
+        |  you choose always depends on your use case and the texts you're
+        |  working with. For a general-purpose use case, the small, default
+        |  models are always a good start. They typically include the following
+        |  components:
+
+    +list
+        +item
+            |  #[strong Binary weights] for the part-of-speech tagger,
+            |  dependency parser and named entity recognizer to predict those
+            |  annotations in context.
+        +item
+            |  #[strong Lexical entries] in the vocabulary, i.e. words and their
+            |  context-independent attributes like the shape or spelling.
+        +item
+            |  #[strong Word vectors], i.e. multi-dimensional meaning
+            |  representations of words that let you determine how similar they
+            |  are to each other.
+        +item
+            |  #[strong Configuration] options, like the language and
+            |  processing pipeline settings, to put spaCy in the correct state
+            |  when you load in the model.
 
     +h(2, "annotations") Linguistic annotations
 
@@ -174,8 +188,13 @@ p
         |  or the object – or whether "google" is used as a verb, or refers to
         |  the website or company in a specific context.
 
+    +aside-code("Loading models", "bash", "$").
+        spacy download en
+        &gt;&gt;&gt; import spacy
+        &gt;&gt;&gt; nlp = spacy.load('en')
+
     p
-        |  Once you've downloaded and installed a #[+a("/usage/models") model],
+        |  Once you've #[+a("/usage/models") downloaded and installed] a model,
         |  you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will
         |  return a #[code Language] object contaning all components and data needed
         |  to process text. We usually call it #[code nlp]. Calling the #[code nlp]

From 34ca59691b6db947679552e899ca7209e41451db Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Tue, 31 Oct 2017 14:50:13 +0000
Subject: [PATCH 41/90] no idea what is wrong here

---
 spacy/tests/conftest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 5fa0c0cb7..ee4093db3 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -14,9 +14,8 @@ from .. import util
 # These languages are used for generic tokenizer tests – only add a language
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
-_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'ga', 'xx']
-
+_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
+              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_depvec_web_lg'],
@@ -107,6 +106,7 @@ def sv_tokenizer():
 def bn_tokenizer():
     return util.get_lang_class('bn').Defaults.create_tokenizer()
 
+
 @pytest.fixture
 def ga_tokenizer():
     return util.get_lang_class('ga').Defaults.create_tokenizer()

From d4a8160c3641f122396f0fe49e39459ce952ab9f Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Tue, 31 Oct 2017 15:15:44 +0000
Subject: [PATCH 42/90] change quotes

---
 spacy/tests/lang/ga/test_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py
index 5b45dddc1..9cfbd555e 100644
--- a/spacy/tests/lang/ga/test_tokenizer.py
+++ b/spacy/tests/lang/ga/test_tokenizer.py
@@ -5,7 +5,7 @@ import pytest
 
 
 GA_TOKEN_EXCEPTION_TESTS = [
-    ('B\'fhearr fanacht as amharc', ['B\'', 'fhearr', 'fanacht', 'as', 'amharc']),
+    ("B'fhearr fanacht as amharc", ["B'", "fhearr", "fanacht", "as", "amharc"]),
     ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
 ]
 

From 77d8f5de9a5627f99ac960a5674c3240b47919c5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 18:25:08 +0100
Subject: [PATCH 43/90] Revise and simplify Vectors class

---
 spacy/tests/doc/test_doc_api.py       |   4 +-
 spacy/tests/doc/test_token_api.py     |   6 +-
 spacy/tests/util.py                   |   4 +-
 spacy/tests/vectors/test_vectors.py   |  21 +-
 spacy/tests/vocab/test_add_vectors.py |   2 +-
 spacy/vectors.pyx                     | 341 ++++++++++++++++----------
 spacy/vocab.pyx                       |  84 ++++---
 7 files changed, 275 insertions(+), 187 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 8f881e811..2c90572e3 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -208,8 +208,8 @@ def test_doc_api_right_edge(en_tokenizer):
 
 def test_doc_api_has_vector():
     vocab = Vocab()
-    vocab.clear_vectors(2)
-    vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
+    vocab.reset_vectors(width=2)
+    vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
     doc = Doc(vocab, words=['kitten'])
     assert doc.has_vector
 
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index a52be9731..c02904905 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -72,9 +72,9 @@ def test_doc_token_api_is_properties(en_vocab):
 
 def test_doc_token_api_vectors():
     vocab = Vocab()
-    vocab.clear_vectors(2)
-    vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f'))
-    vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
+    vocab.reset_vectors(width=2)
+    vocab.set_vector('apples', vector=numpy.asarray([0., 2.], dtype='f'))
+    vocab.set_vector('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
     doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
     assert doc.has_vector
 
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 2f474a926..2de97583c 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
     """Add list of vector tuples to given vocab. All vectors need to have the
     same length. Format: [("text", [1, 2, 3])]"""
     length = len(vectors[0][1])
-    vocab.clear_vectors(length)
+    vocab.reset_vectors(width=length)
     for word, vec in vectors:
-        vocab.set_vector(word, vec)
+        vocab.set_vector(word, vector=vec)
     return vocab
 
 
diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py
index 74ac26a10..ce183f9fd 100644
--- a/spacy/tests/vectors/test_vectors.py
+++ b/spacy/tests/vectors/test_vectors.py
@@ -35,20 +35,18 @@ def vocab(en_vocab, vectors):
 
 
 def test_init_vectors_with_data(strings, data):
-    v = Vectors(strings, data=data)
+    v = Vectors(data=data)
     assert v.shape == data.shape
 
-def test_init_vectors_with_width(strings):
-    v = Vectors(strings, width=3)
-    for string in strings:
-        v.add(string)
+def test_init_vectors_with_shape(strings):
+    v = Vectors(shape=(len(strings), 3))
     assert v.shape == (len(strings), 3)
 
 
 def test_get_vector(strings, data):
-    v = Vectors(strings, data=data)
-    for string in strings:
-        v.add(string)
+    v = Vectors(data=data)
+    for i, string in enumerate(strings):
+        v.add(string, row=i)
     assert list(v[strings[0]]) == list(data[0])
     assert list(v[strings[0]]) != list(data[1])
     assert list(v[strings[1]]) != list(data[0])
@@ -56,9 +54,9 @@ def test_get_vector(strings, data):
 
 def test_set_vector(strings, data):
     orig = data.copy()
-    v = Vectors(strings, data=data)
-    for string in strings:
-        v.add(string)
+    v = Vectors(data=data)
+    for i, string in enumerate(strings):
+        v.add(string, row=i)
     assert list(v[strings[0]]) == list(orig[0])
     assert list(v[strings[0]]) != list(orig[1])
     v[strings[0]] = data[1]
@@ -66,7 +64,6 @@ def test_set_vector(strings, data):
     assert list(v[strings[0]]) != list(orig[0])
 
 
-
 @pytest.fixture()
 def tokenizer_v(vocab):
     return Tokenizer(vocab, {}, None, None, None)
diff --git a/spacy/tests/vocab/test_add_vectors.py b/spacy/tests/vocab/test_add_vectors.py
index 3cb0b632c..3dcce67cc 100644
--- a/spacy/tests/vocab/test_add_vectors.py
+++ b/spacy/tests/vocab/test_add_vectors.py
@@ -36,5 +36,5 @@ def test_vocab_prune_vectors():
     remap = vocab.prune_vectors(2)
     assert list(remap.keys()) == [u'kitten']
     neighbour, similarity = remap.values()[0]
-    assert neighbour == u'cat'
+    assert neighbour == u'cat', remap
     assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 0adeabe4d..cb9f0c0e6 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -15,6 +15,12 @@ from .compat import basestring_, path2str
 from . import util
 
 
+def unpickle_vectors(keys_and_rows, data):
+    vectors = Vectors(data=data)
+    vectors.key2row.update(keys_and_rows)  # same effect as add(key, row=row)
+    return vectors  # pickle uses the return value as the restored object
+
+
 cdef class Vectors:
     """Store, save and load word vectors.
 
@@ -23,130 +29,35 @@ cdef class Vectors:
     (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
     rows in the vectors.data table.
     
-    Multiple keys can be mapped to the same vector, so len(keys) may be greater
-    (but not smaller) than data.shape[0].
+    Multiple keys can be mapped to the same vector, and not all of the rows in
+    the table need to be assigned --- so len(list(vectors.keys())) may be
+    greater or smaller than vectors.shape[0].
     """
     cdef public object data
-    cdef readonly StringStore strings
     cdef public object key2row
-    cdef public int _i_vec
+    cdef public object _unset
 
-    def __init__(self, strings, width=0, data=None):
-        """Create a new vector store. To keep the vector table empty, pass
-        `width=0`. You can also create the vector table and add vectors one by
-        one, or set the vector values directly on initialisation.
-
-        strings (StringStore or list): List of strings or StringStore that maps
-            strings to hash values, and vice versa.
-        width (int): Number of dimensions.
+    def __init__(self, *, shape=None, data=None, keys=None):
+        """Create a new vector store.
+        
+        shape (tuple): Size of the table, as (# entries, # columns)
         data (numpy.ndarray): The vector data.
         RETURNS (Vectors): The newly created object.
         """
-        if isinstance(strings, StringStore):
-            self.strings = strings
+        if data is None:
+            if shape is None:
+                shape = (0,0)
+            data = numpy.zeros(shape, dtype='f')
+        self.data = data
+        self.key2row = OrderedDict()
+        if self.data is not None:
+            self._unset = set(range(self.data.shape[0]))
         else:
-            self.strings = StringStore()
-            for string in strings:
-                self.strings.add(string)
-        if data is not None:
-            self.data = numpy.asarray(data, dtype='f')
-        else:
-            self.data = numpy.zeros((len(self.strings), width), dtype='f')
-        self._i_vec = 0
-        self.key2row = {}
-        if data is not None:
-            for i, string in enumerate(self.strings):
-                if i >= self.data.shape[0]:
-                    break
-                self.add(self.strings[string], vector=self.data[i])
-
-    def __reduce__(self):
-        return (Vectors, (self.strings, self.data))
-
-    def __getitem__(self, key):
-        """Get a vector by key. If key is a string, it is hashed to an integer
-        ID using the vectors.strings table. If the integer key is not found in
-        the table, a KeyError is raised.
-
-        key (unicode / int): The key to get the vector for.
-        RETURNS (numpy.ndarray): The vector for the key.
-        """
-        if isinstance(key, basestring):
-            key = self.strings[key]
-        i = self.key2row[key]
-        if i is None:
-            raise KeyError(key)
-        else:
-            return self.data[i]
-
-    def __setitem__(self, key, vector):
-        """Set a vector for the given key. If key is a string, it is hashed
-        to an integer ID using the vectors.strings table.
-
-        key (unicode / int): The key to set the vector for.
-        vector (numpy.ndarray): The vector to set.
-        """
-        if isinstance(key, basestring):
-            key = self.strings.add(key)
-        i = self.key2row[key]
-        self.data[i] = vector
-
-    def __iter__(self):
-        """Yield vectors from the table.
-
-        YIELDS (numpy.ndarray): A vector.
-        """
-        yield from self.data
-
-    def __len__(self):
-        """Return the number of vectors that have been assigned.
-
-        RETURNS (int): The number of vectors in the data.
-        """
-        return self._i_vec
-
-    def __contains__(self, key):
-        """Check whether a key has a vector entry in the table.
-
-        key (unicode / int): The key to check.
-        RETURNS (bool): Whether the key has a vector entry.
-        """
-        if isinstance(key, basestring_):
-            key = self.strings[key]
-        return key in self.key2row
-
-    def add(self, key, *, vector=None, row=None):
-        """Add a key to the table. Keys can be mapped to an existing vector
-        by setting `row`, or a new vector can be added.
-
-        key (unicode / int): The key to add.
-        vector (numpy.ndarray / None): A vector to add for the key.
-        row (int / None): The row-number of a vector to map the key to.
-        """
-        if isinstance(key, basestring_):
-            key = self.strings.add(key)
-        if row is None and key in self.key2row:
-            row = self.key2row[key]
-        elif row is None:
-            row = self._i_vec
-            self._i_vec += 1
-        if row >= self.data.shape[0]:
-            self.data.resize((row*2, self.data.shape[1]))
-
-        self.key2row[key] = row
-        if vector is not None:
-            self.data[row] = vector
-        return row
-
-    def items(self):
-        """Iterate over `(string key, vector)` pairs, in order.
-
-        YIELDS (tuple): A key/vector pair.
-        """
-        for key, row in self.key2row.items():
-            string = self.strings[key]
-            yield string, self.data[row]
-
+            self._unset = set()
+        if keys is not None:
+            for i, key in enumerate(keys):
+                self.add(key, row=i)
+    
     @property
     def shape(self):
         """Get `(rows, dims)` tuples of number of rows and number of dimensions
@@ -156,9 +67,179 @@ cdef class Vectors:
         """
         return self.data.shape
 
-    def most_similar(self, key):
-        # TODO: implement
-        raise NotImplementedError
+    @property
+    def size(self):
+        """Return rows*dims"""
+        return self.data.shape[0] * self.data.shape[1]
+
+    @property
+    def is_full(self):
+        """Returns True if no rows are available for new keys."""
+        return len(self._unset) == 0
+
+    def __reduce__(self):
+        keys_and_rows = list(self.key2row.items())  # item views don't pickle
+        return (unpickle_vectors, (keys_and_rows, self.data))
+
+    def __getitem__(self, key):
+        """Get a vector by key. If the key is not found, a KeyError is raised.
+
+        key (int): The key to get the vector for.
+        RETURNS (ndarray): The vector for the key.
+        """
+        i = self.key2row[key]
+        if i is None:
+            raise KeyError(key)
+        else:
+            return self.data[i]
+
+    def __setitem__(self, key, vector):
+        """Set a vector for the given key.
+
+        key (int): The key to set the vector for.
+        vector (numpy.ndarray): The vector to set.
+        """
+        i = self.key2row[key]
+        self.data[i] = vector
+        if i in self._unset:
+            self._unset.remove(i)
+
+    def __iter__(self):
+        """Yield the keys in the table (iterates `key2row`, not the vectors).
+
+        YIELDS: A key.
+        """
+        yield from self.key2row
+
+    def __len__(self):
+        """Return the number of vectors in the table.
+
+        RETURNS (int): The number of vectors in the data.
+        """
+        return self.data.shape[0]
+
+    def __contains__(self, key):
+        """Check whether a key has been mapped to a vector entry in the table.
+
+        key (int): The key to check.
+        RETURNS (bool): Whether the key has a vector entry.
+        """
+        return key in self.key2row
+
+    def resize(self, shape, inplace=False):
+        '''Resize the underlying vectors array. If inplace=True, the memory
+        is reallocated. This may cause other references to the data to become
+        invalid, so only use inplace=True if you're sure that's what you want.
+
+        If the number of vectors is reduced, keys mapped to rows that have been
+        deleted are removed. These removed items are returned as a list of
+        (key, row) tuples.
+        '''
+        if inplace:
+            self.data.resize(shape, refcheck=False)
+        else:
+            xp = get_array_module(self.data)
+            self.data = xp.resize(self.data, shape)
+        filled = {row for row in self.key2row.values()}
+        self._unset = {row for row in range(shape[0]) if row not in filled}
+        removed_items = []
+        for key, row in dict(self.key2row.items()):
+            if row >= shape[0]:
+                self.key2row.pop(key)
+                removed_items.append((key, row))
+        return removed_items
+    
+    def keys(self):
+        '''Iterate over the keys in the table.'''
+        yield from self.key2row.keys()
+    
+    def values(self):
+        '''Iterate over vectors that have been assigned to at least one key.
+
+        Note that some vectors may be unassigned, so the number of vectors
+        returned may be less than the length of the vectors table.'''
+        for row, vector in enumerate(self.data):
+            if row not in self._unset:
+                yield vector
+
+    def items(self):
+        """Iterate over `(key, vector)` pairs.
+
+        YIELDS (tuple): A key/vector pair.
+        """
+        for key, row in self.key2row.items():
+            yield key, self.data[row]
+
+    def get_keys(self, rows):
+        xp = get_array_module(self.data)
+        row2key = {row: key for key, row in self.key2row.items()}
+        keys = xp.asarray([row2key[row] for row in rows],
+                           dtype='uint64')
+        return keys
+
+    def get_rows(self, keys):
+        xp = get_array_module(self.data)
+        k2r = self.key2row
+        return xp.asarray([k2r.get(key, -1) for key in keys], dtype='i')
+
+    def add(self, key, *, vector=None, row=None):
+        """Add a key to the table. Keys can be mapped to an existing vector
+        by setting `row`, or a new vector can be added.
+
+        key (unicode / int): The key to add.
+        vector (numpy.ndarray / None): A vector to add for the key.
+        row (int / None): The row-number of a vector to map the key to.
+        """
+        if row is None and key in self.key2row:
+            row = self.key2row[key]
+        elif row is None:
+            if self.is_full:
+                raise ValueError("Cannot add new key to vectors -- full")
+            row = min(self._unset)
+
+        self.key2row[key] = row
+        if vector is not None:
+            self.data[row] = vector
+            if row in self._unset:
+                self._unset.remove(row)
+        return row
+    
+    def most_similar(self, queries, *, return_scores=False, return_rows=False,
+            batch_size=1024):
+        '''For each of the given vectors, find the single entry most similar
+        to it, by cosine.
+
+        Queries are by vector. Results are returned as an array of keys, or
+        as a tuple that also holds the rows (return_rows=True) and/or the
+        scores (return_scores=True). If `queries` is large, the calculations
+        are performed in chunks, to avoid consuming too much memory. Set
+        `batch_size` to control the size/space trade-off.
+        '''
+        xp = get_array_module(self.data)
+
+        vectors = self.data / xp.linalg.norm(self.data, axis=1, keepdims=True)
+
+        best_rows = xp.zeros((queries.shape[0],), dtype='i')
+        scores = xp.zeros((queries.shape[0],), dtype='f')
+        # Work in batches, to avoid memory problems.
+        for i in range(0, queries.shape[0], batch_size):
+            batch = queries[i : i+batch_size]
+            batch = batch / xp.linalg.norm(batch, axis=1, keepdims=True)
+            # batch   e.g. (1024, 300)
+            # vectors e.g. (10000, 300)
+            # sims    e.g. (1024, 10000)
+            sims = xp.dot(batch, vectors.T)
+            best_rows[i:i+batch_size] = sims.argmax(axis=1)
+            scores[i:i+batch_size] = sims.max(axis=1)
+        keys = self.get_keys(best_rows)
+        if return_rows and return_scores:
+            return (keys, best_rows, scores)
+        elif return_rows:
+            return (keys, best_rows)
+        elif return_scores:
+            return (keys, scores)
+        else:
+            return keys
 
     def from_glove(self, path):
         """Load GloVe vectors from a directory. Assumes binary format,
@@ -168,27 +249,33 @@ cdef class Vectors:
         By default GloVe outputs 64-bit vectors.
 
         path (unicode / Path): The path to load the GloVe vectors from.
+
+        RETURNS: A StringStore object, holding the key-to-string mapping.
         """
         path = util.ensure_path(path)
+        width = None
         for name in path.iterdir():
             if name.parts[-1].startswith('vectors'):
                 _, dims, dtype, _2 = name.parts[-1].split('.')
-                self.width = int(dims)
+                width = int(dims)
                 break
         else:
             raise IOError("Expected file named e.g. vectors.128.f.bin")
         bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
                                                              dtype=dtype)
+        xp = get_array_module(self.data)
+        self.data = None
         with bin_loc.open('rb') as file_:
-            self.data = numpy.fromfile(file_, dtype='float64')
-            self.data = numpy.ascontiguousarray(self.data, dtype='float32')
+            self.data = xp.fromfile(file_, dtype=dtype)
+            if dtype != 'float32':
+                self.data = xp.ascontiguousarray(self.data, dtype='float32')
         n = 0
+        strings = StringStore()
         with (path / 'vocab.txt').open('r') as file_:
-            for line in file_:
-                self.add(line.strip())
-                n += 1
-        if (self.data.size % self.width) == 0:
-            self.data
+            for i, line in enumerate(file_):
+                key = strings.add(line.strip())
+                self.add(key, row=i)
+        return strings
 
     def to_disk(self, path, **exclude):
         """Save the current state to a directory.
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index cfc81bbe9..ffc81ad0b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -55,7 +55,7 @@ cdef class Vocab:
                 _ = self[string]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.vectors = Vectors(self.strings, width=0)
+        self.vectors = Vectors()
 
     property lang:
         def __get__(self):
@@ -241,15 +241,19 @@ cdef class Vocab:
     def vectors_length(self):
         return self.vectors.data.shape[1]
 
-    def clear_vectors(self, width=None):
+    def reset_vectors(self, *, width=None, shape=None):
         """Drop the current vector table. Because all vectors must be the same
         width, you have to call this to change the size of the vectors.
         """
-        if width is None:
-            width = self.vectors.data.shape[1]
-        self.vectors = Vectors(self.strings, width=width)
+        if width is not None and shape is not None:
+            raise ValueError("Only one of width and shape can be specified")
+        elif shape is not None:
+            self.vectors = Vectors(shape=shape)
+        else:
+            width = width if width is not None else self.vectors.data.shape[1]
+            self.vectors = Vectors(shape=(self.vectors.shape[0], width))
 
-    def prune_vectors(self, nr_row, batch_size=8):
+    def prune_vectors(self, nr_row, batch_size=1024):
         """Reduce the current vector table to `nr_row` unique entries. Words
         mapped to the discarded vectors will be remapped to the closest vector
         among those remaining.
@@ -275,37 +279,29 @@ cdef class Vocab:
             two words.
         """
         xp = get_array_module(self.vectors.data)
-        # Work in batches, to avoid memory problems.
-        keep = self.vectors.data[:nr_row]
-        keep_keys = [key for key, row in self.vectors.key2row.items() if row < nr_row]
-        toss = self.vectors.data[nr_row:]
-        # Normalize the vectors, so cosine similarity is just dot product.
-        # Note we can't modify the ones we're keeping in-place...
-        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-12)
-        keep = xp.ascontiguousarray(keep.T)
-        neighbours = xp.zeros((toss.shape[0],), dtype='i')
-        scores = xp.zeros((toss.shape[0],), dtype='f')
-        for i in range(0, toss.shape[0], batch_size):
-            batch = toss[i : i+batch_size]
-            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-12
-            sims = xp.dot(batch, keep)
-            matches = sims.argmax(axis=1)
-            neighbours[i:i+batch_size] = matches
-            scores[i:i+batch_size] = sims.max(axis=1)
-        i2k = {i: key for key, i in self.vectors.key2row.items()}
+        # Make prob negative so it sorts by rank ascending
+        # (key2row contains the rank)
+        priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
+                    for lex in self if lex.orth in self.vectors.key2row]
+        priority.sort()
+        indices = xp.asarray([i for (prob, i, key) in priority], dtype='i')
+        keys = xp.asarray([key for (prob, i, key) in priority], dtype='uint64')
+        
+        keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
+        toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
+
+        self.vectors = Vectors(data=keep, keys=keys)
+
+        syn_keys, syn_rows, scores = self.vectors.most_similar(toss,
+                                        return_rows=True, return_scores=True)
+
         remap = {}
-        for lex in list(self):
-            # If we're losing the vector for this word, map it to the nearest
-            # vector we're keeping.
-            if lex.rank >= nr_row:
-                lex.rank = neighbours[lex.rank-nr_row]
-                self.vectors.add(lex.orth, row=lex.rank)
-                remap[lex.orth_] = (self.strings[i2k[lex.rank]], scores[lex.rank])
-        for key, row in self.vectors.key2row.items():
-            if row >= nr_row:
-                self.vectors.key2row[key] = neighbours[row-nr_row]
-        # Make copy, to encourage the original table to be garbage collected.
-        self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
+        for i, key in enumerate(keys[nr_row:]):
+            self.vectors.add(key, row=syn_rows[i])
+            word = self.strings[key]
+            synonym = self.strings[syn_keys[i]]
+            score = scores[i]
+            remap[word] = (synonym, score)
         link_vectors_to_models(self)
         return remap
 
@@ -329,11 +325,19 @@ cdef class Vocab:
         """Set a vector for a word in the vocabulary. Words can be referenced
         by string or int ID.
         """
-        if self.vectors.data.size == 0:
-            self.clear_vectors(vector.shape[0])
-        lex = self[orth]
+        if isinstance(orth, basestring_):
+            orth = self.strings.add(orth)
+        if self.vectors.is_full and orth not in self.vectors:
+            new_rows = max(100, int(self.vectors.shape[0]*1.3))
+            if self.vectors.shape[1] == 0:
+                width = vector.size
+            else:
+                width = self.vectors.shape[1]
+            self.vectors.resize((new_rows, width))
+            print(self.vectors.shape)
+            self.vectors.add(orth, vector=vector)
+        print("Adding", orth, self.vectors.is_full)
         self.vectors.add(orth, vector=vector)
-        lex.rank = self.vectors.key2row[lex.orth]
 
     def has_vector(self, orth):
         """Check whether a word has a vector. Returns False if no vectors have

From 59203a2e8a9efa178f94b07a439ee9d97842b8ae Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 19:10:01 +0100
Subject: [PATCH 44/90] Move vector pruning command into spacy vocab cli tool

---
 spacy/cli/vocab.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/vocab.py b/spacy/cli/vocab.py
index d05eff3f0..9847dd3d8 100644
--- a/spacy/cli/vocab.py
+++ b/spacy/cli/vocab.py
@@ -7,6 +7,7 @@ import spacy
 import numpy
 from pathlib import Path
 
+from ..vectors import Vectors
 from ..util import prints, ensure_path
 
 
@@ -16,8 +17,12 @@ from ..util import prints, ensure_path
     lexemes_loc=("location of JSONL-formatted lexical data", "positional",
                  None, Path),
     vectors_loc=("optional: location of vectors data, as numpy .npz",
-                 "positional", None, str))
-def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
+                 "positional", None, str),
+    prune_vectors=("optional: number of vectors to prune to.",
+                   "option", "V", int)
+)
+def make_vocab(cmd, lang, output_dir, lexemes_loc,
+               vectors_loc=None, prune_vectors=0):
     """Compile a vocabulary from a lexicon jsonl file and word vectors."""
     if not lexemes_loc.exists():
         prints(lexemes_loc, title="Can't find lexical data", exits=1)
@@ -26,7 +31,6 @@ def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
     for word in nlp.vocab:
         word.rank = 0
     lex_added = 0
-    vec_added = 0
     with lexemes_loc.open() as file_:
         for line in file_:
             if line.strip():
@@ -39,16 +43,18 @@ def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
                     assert lex.rank == attrs['id']
                 lex_added += 1
     if vectors_loc is not None:
-        vector_data = numpy.load(open(vectors_loc, 'rb'))
-        nlp.vocab.clear_vectors(width=vector_data.shape[1])
+        vector_data = numpy.load(vectors_loc.open('rb'))
+        nlp.vocab.vectors = Vectors(data=vector_data)
         for word in nlp.vocab:
             if word.rank:
-                nlp.vocab.vectors.add(word.orth_, row=word.rank,
-                                      vector=vector_data[word.rank])
-                vec_added += 1
+                nlp.vocab.vectors.add(word.orth, row=word.rank)
+
+    if prune_vectors is not None:
+        remap = nlp.vocab.prune_vectors(prune_vectors)
     if not output_dir.exists():
         output_dir.mkdir()
     nlp.to_disk(output_dir)
+    vec_added = len(nlp.vocab.vectors)
     prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
            title="Sucessfully compiled vocab and vectors, and saved model")
     return nlp

From 9b0de9fb43fc5fccaeb3115c5b24c123b45e89ab Mon Sep 17 00:00:00 2001
From: Ines Montani <ines.montani@gmail.com>
Date: Tue, 31 Oct 2017 19:17:58 +0100
Subject: [PATCH 45/90] Fix import of symbols (now nested one level lower)

---
 spacy/lang/ga/tokenizer_exceptions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index 7d29f4bcc..70ee051e9 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import ORTH, LEMMA, NORM, POS
+from ...symbols import ORTH, LEMMA, NORM, POS
 
 
 _exc = {

From 3659a807b097b0fd54e63a806d2d871f6a5fa02a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 19:21:05 +0100
Subject: [PATCH 46/90] Remove vector pruning arg from train CLI

---
 spacy/cli/train.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 34117db22..74e1d6d68 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -32,7 +32,6 @@ numpy.random.seed(0)
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
     vectors=("Model to load vectors from", "option", "v"),
-    vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
@@ -41,7 +40,7 @@ numpy.random.seed(0)
     meta_path=("Optional path to meta.json. All relevant properties will be "
                "overwritten.", "option", "m", Path))
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
-          use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False,
+          use_gpu=-1, vectors=None, no_tagger=False,
           no_parser=False, no_entities=False, gold_preproc=False,
           version="0.0.0", meta_path=None):
     """
@@ -95,8 +94,6 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
-        if vectors_limit is not None:
-            nlp.vocab.prune_vectors(vectors_limit)
     for name in pipeline:
         nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

From 807572683851262a8234ddef46eec9fa7bebda6c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 19:21:17 +0100
Subject: [PATCH 47/90] Restore vector usage in models

---
 spacy/_ml.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index fa8e83d48..6bfacb20a 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -208,11 +208,11 @@ class PrecomputableAffine(Model):
 def link_vectors_to_models(vocab):
     vectors = vocab.vectors
     ops = Model.ops
-    #for word in vocab:
-    #    if word.orth in vectors.key2row:
-    #        word.rank = vectors.key2row[word.orth]
-    #    else:
-    #        word.rank = 0
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
     data = ops.asarray(vectors.data)
     # Set an entry here, so that vectors are accessed by StaticVectors
     # (unideal, I know)

From 997a61557abb2e8205294b2ec2a892b17b323a27 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 19:30:52 +0100
Subject: [PATCH 48/90] Add vectors.n_keys property

---
 spacy/vectors.pyx | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index cb9f0c0e6..bde793ec5 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -77,6 +77,11 @@ cdef class Vectors:
         """Returns True if no keys are available for new keys."""
         return len(self._unset) == 0
 
+    @property
+    def n_keys(self):
+        """Returns True if no keys are available for new keys."""
+        return len(self.key2row)
+
     def __reduce__(self):
         keys_and_rows = self.key2row.items()
         return (unpickle_vectors, (keys_and_rows, self.data))
@@ -204,7 +209,7 @@ cdef class Vectors:
                 self._unset.remove(row)
         return row
     
-    def most_similar(self, queries, *, return_scores=False, return_rows,
+    def most_similar(self, queries, *, return_scores=False, return_rows=False,
             batch_size=1024):
         '''For each of the given vectors, find the single entry most similar
         to it, by cosine.

From 147448b65b5781ce692000e3efa13cde526aa6d6 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines.montani@gmail.com>
Date: Tue, 31 Oct 2017 19:34:45 +0100
Subject: [PATCH 49/90] Add missing symbols

---
 spacy/lang/ga/tokenizer_exceptions.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index 70ee051e9..185b08895 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -1,7 +1,8 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, NORM, POS
+from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX
+from ...symbols import ORTH, LEMMA, NORM
 
 
 _exc = {

From d90a22afe66a313db907f6acb3a0068b0de7c6bc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 19:58:35 +0100
Subject: [PATCH 50/90] Fix loading previous vectors models

---
 spacy/vectors.pyx | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index bde793ec5..08ab586d1 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -307,9 +307,18 @@ cdef class Vectors:
         path (unicode / Path): Directory path, string or Path-like object.
         RETURNS (Vectors): The modified object.
         """
-        def load_keys(path):
+        def load_key2row(path):
             if path.exists():
                 self.key2row = msgpack.load(path.open('rb'))
+            for key, row in self.key2row.items():
+                if row in self._unset:
+                    self._unset.remove(row)
+
+        def load_keys(path):
+            if path.exists():
+                keys = numpy.load(str(path))
+                for i, key in enumerate(keys):
+                    self.add(key, row=i)
 
         def load_vectors(path):
             xp = Model.ops.xp
@@ -317,7 +326,8 @@ cdef class Vectors:
                 self.data = xp.load(path)
 
         serializers = OrderedDict((
-            ('key2row', load_keys),
+            ('key2row', load_key2row),
+            ('keys', load_keys),
             ('vectors', load_vectors),
         ))
         util.from_disk(path, serializers, exclude)

From 06c25a888244e8520d5eeb2df8d5d86499325f48 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines.montani@gmail.com>
Date: Tue, 31 Oct 2017 20:13:16 +0100
Subject: [PATCH 51/90] Remove comma that caused list to wrap in tuple!

Also removed extra dict wrappings for performance (we used to have them in there, but they should only really exist if copying the dict is absolutely necessary)
---
 spacy/lang/ga/tokenizer_exceptions.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index 185b08895..e93ada52f 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -24,8 +24,7 @@ _exc = {
 
     "led'": [
         {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
-        {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}],
-
+        {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET}]
 }
 
 for exc_data in [
@@ -77,11 +76,11 @@ for exc_data in [
     {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN},
     {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN},
     {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)],
+    _exc[exc_data[ORTH]] = [exc_data]
 
 for orth in [
     "d'", "D'"]:
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc

From c390f2d74506e2588a7ef7b265b03057d1453d62 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 20:14:47 +0100
Subject: [PATCH 52/90] Make it easier to pass explicit no-pruning to vocab

---
 spacy/cli/vocab.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/vocab.py b/spacy/cli/vocab.py
index 9847dd3d8..5f6f58d80 100644
--- a/spacy/cli/vocab.py
+++ b/spacy/cli/vocab.py
@@ -22,7 +22,7 @@ from ..util import prints, ensure_path
                    "option", "V", int)
 )
 def make_vocab(cmd, lang, output_dir, lexemes_loc,
-               vectors_loc=None, prune_vectors=0):
+               vectors_loc=None, prune_vectors=-1):
     """Compile a vocabulary from a lexicon jsonl file and word vectors."""
     if not lexemes_loc.exists():
         prints(lexemes_loc, title="Can't find lexical data", exits=1)
@@ -49,7 +49,7 @@ def make_vocab(cmd, lang, output_dir, lexemes_loc,
             if word.rank:
                 nlp.vocab.vectors.add(word.orth, row=word.rank)
 
-    if prune_vectors is not None:
+    if prune_vectors >= 1:
         remap = nlp.vocab.prune_vectors(prune_vectors)
     if not output_dir.exists():
         output_dir.mkdir()

From 7e424a18049be54890a68111e3d9ff679dc7d723 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Tue, 31 Oct 2017 21:05:29 +0100
Subject: [PATCH 53/90] Don't copy exception dicts if not necessary and tidy up

---
 spacy/lang/bn/tokenizer_exceptions.py |  2 +-
 spacy/lang/da/tokenizer_exceptions.py |  2 +-
 spacy/lang/de/tokenizer_exceptions.py |  2 +-
 spacy/lang/en/tokenizer_exceptions.py |  2 +-
 spacy/lang/es/tokenizer_exceptions.py |  2 +-
 spacy/lang/fi/tokenizer_exceptions.py |  2 +-
 spacy/lang/fr/tokenizer_exceptions.py |  2 +-
 spacy/lang/hu/tokenizer_exceptions.py |  2 +-
 spacy/lang/id/tokenizer_exceptions.py |  3 +-
 spacy/lang/nb/tokenizer_exceptions.py |  2 +-
 spacy/lang/pl/tokenizer_exceptions.py |  2 +-
 spacy/lang/pt/tokenizer_exceptions.py |  2 +-
 spacy/lang/sv/tokenizer_exceptions.py |  2 +-
 spacy/lang/th/tag_map.py              | 16 ++++----
 spacy/lang/th/tokenizer_exceptions.py | 56 +++++++++------------------
 15 files changed, 39 insertions(+), 60 deletions(-)

diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py
index f6e2c9ed9..5c6de139b 100644
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@@ -23,4 +23,4 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [dict(exc_data)]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index 6bf9ab669..e8edf36b8 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -30,4 +30,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 184d88104..0b23a1001 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -181,4 +181,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index b9fde7882..0e5bbc7f6 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -456,4 +456,4 @@ for string in _exclude:
         _exc.pop(string)
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 77d9a2841..cb62f008f 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -54,4 +54,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index a5e18bcfa..33e223575 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -76,4 +76,4 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [dict(exc_data)]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 5d8c37878..442b367dd 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -147,5 +147,5 @@ _regular_exp += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".f
 _regular_exp.append(URL_PATTERN)
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match
diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py
index dd8fdab6c..834c35265 100644
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -95,5 +95,5 @@ _nums = "(({ne})|({t})|({on})|({c}))({s})?".format(
     c=CURRENCY, s=_suffixes)
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py
index 9978606b0..3bba57e4c 100644
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@@ -46,5 +46,4 @@ for orth in [
     ]:
     _exc[orth] = [{ORTH: orth}]
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
-
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py
index a01c1363c..1529315ca 100644
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@@ -35,4 +35,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py
index 4dffb6209..fb87ae8a6 100644
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@@ -20,4 +20,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py
index 72348fa64..6e8b8a24c 100644
--- a/spacy/lang/pt/tokenizer_exceptions.py
+++ b/spacy/lang/pt/tokenizer_exceptions.py
@@ -72,4 +72,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index b7d9834fe..0575c3892 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -80,4 +80,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py
index 570871820..374900bd9 100644
--- a/spacy/lang/th/tag_map.py
+++ b/spacy/lang/th/tag_map.py
@@ -2,10 +2,10 @@
 # data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
 from __future__ import unicode_literals
 
-from ...symbols import *
-
+from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, PROPN, DET, NUM, AUX
+from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ
 TAG_MAP = {
-    #NOUN
+    # NOUN
     "NOUN":     {POS: NOUN},
     "NCMN":     {POS: NOUN},
     "NTTL":     {POS: NOUN},
@@ -14,7 +14,7 @@ TAG_MAP = {
     "CMTR":     {POS: NOUN},
     "CFQC":     {POS: NOUN},
     "CVBL":     {POS: NOUN},
-    #PRON
+    # PRON
     "PRON":     {POS: PRON},
     "NPRP":     {POS: PRON},
     # ADJ
@@ -28,7 +28,7 @@ TAG_MAP = {
     "ADVI":     {POS: ADV},
     "ADVP":     {POS: ADV},
     "ADVS":     {POS: ADV},
-	# INT
+    # INT
     "INT":      {POS: INTJ},
     # PRON
     "PROPN":    {POS: PROPN},
@@ -50,20 +50,20 @@ TAG_MAP = {
     "NCNM":     {POS: NUM},
     "NLBL":     {POS: NUM},
     "DCNM":     {POS: NUM},
-	# AUX
+    # AUX
     "AUX":      {POS: AUX},
     "XVBM":     {POS: AUX},
     "XVAM":     {POS: AUX},
     "XVMM":     {POS: AUX},
     "XVBB":     {POS: AUX},
     "XVAE":     {POS: AUX},
-	# ADP
+    # ADP
     "ADP":      {POS: ADP},
     "RPRE":     {POS: ADP},
     # CCONJ
     "CCONJ":    {POS: CCONJ},
     "JCRG":     {POS: CCONJ},
-	# SCONJ
+    # SCONJ
     "SCONJ":    {POS: SCONJ},
     "PREL":     {POS: SCONJ},
     "JSBR":     {POS: SCONJ},
diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py
index c31595893..ee14acf40 100644
--- a/spacy/lang/th/tokenizer_exceptions.py
+++ b/spacy/lang/th/tokenizer_exceptions.py
@@ -1,43 +1,23 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import *
+from ...symbols import ORTH, LEMMA
 
-TOKENIZER_EXCEPTIONS = {
-    "ม.ค.": [
-        {ORTH: "ม.ค.", LEMMA: "มกราคม"}
-    ],
-    "ก.พ.": [
-        {ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}
-    ],
-    "มี.ค.": [
-        {ORTH: "มี.ค.", LEMMA: "มีนาคม"}
-    ],
-    "เม.ย.": [
-        {ORTH: "เม.ย.", LEMMA: "เมษายน"}
-    ],
-    "พ.ค.": [
-        {ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}
-    ],
-    "มิ.ย.": [
-        {ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}
-    ],
-    "ก.ค.": [
-        {ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}
-    ],
-    "ส.ค.": [
-        {ORTH: "ส.ค.", LEMMA: "สิงหาคม"}
-    ],
-    "ก.ย.": [
-        {ORTH: "ก.ย.", LEMMA: "กันยายน"}
-    ],
-    "ต.ค.": [
-        {ORTH: "ต.ค.", LEMMA: "ตุลาคม"}
-    ],
-    "พ.ย.": [
-        {ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}
-    ],
-    "ธ.ค.": [
-        {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
-    ]
+
+_exc = {
+    "ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}],
+    "ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}],
+    "มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}],
+    "เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}],
+    "พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}],
+    "มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}],
+    "ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}],
+    "ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}],
+    "ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}],
+    "ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}],
+    "พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}],
+    "ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}]
 }
+
+
+TOKENIZER_EXCEPTIONS = _exc

From c5799ecc7be46be11d4ed5cdba51d7b48d3cbf5a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 21:12:33 +0100
Subject: [PATCH 54/90] Remove print statement

---
 spacy/vocab.pyx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ffc81ad0b..14b62a808 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -334,9 +334,7 @@ cdef class Vocab:
             else:
                 width = self.vectors.shape[1]
             self.vectors.resize((new_rows, width))
-            print(self.vectors.shape)
             self.vectors.add(orth, vector=vector)
-        print("Adding", orth, self.vectors.is_full)
         self.vectors.add(orth, vector=vector)
 
     def has_vector(self, orth):

From fe4b10346a4e625ee3d286262b76f5a248e68a24 Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Tue, 31 Oct 2017 20:24:53 +0000
Subject: [PATCH 55/90] replace example sentence until I get around to adding a
 punctuation.py

---
 spacy/tests/lang/ga/test_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py
index 9cfbd555e..1c6f68bad 100644
--- a/spacy/tests/lang/ga/test_tokenizer.py
+++ b/spacy/tests/lang/ga/test_tokenizer.py
@@ -5,7 +5,7 @@ import pytest
 
 
 GA_TOKEN_EXCEPTION_TESTS = [
-    ("B'fhearr fanacht as amharc", ["B'", "fhearr", "fanacht", "as", "amharc"]),
+    ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '('. 'lch.', '600', ')', '.']),
     ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
 ]
 

From 92dc12756920906a7afb5a13da3f95f041c0068b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 31 Oct 2017 22:21:55 +0100
Subject: [PATCH 56/90] Fix test for Python 3

---
 spacy/tests/vocab/test_add_vectors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/vocab/test_add_vectors.py b/spacy/tests/vocab/test_add_vectors.py
index 3dcce67cc..3ef599678 100644
--- a/spacy/tests/vocab/test_add_vectors.py
+++ b/spacy/tests/vocab/test_add_vectors.py
@@ -35,6 +35,6 @@ def test_vocab_prune_vectors():
 
     remap = vocab.prune_vectors(2)
     assert list(remap.keys()) == [u'kitten']
-    neighbour, similarity = remap.values()[0]
+    neighbour, similarity = list(remap.values())[0]
     assert neighbour == u'cat', remap
     assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)

From 25b1d6cd9151b1544e36295bb1a1e171abf268c0 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines.montani@gmail.com>
Date: Tue, 31 Oct 2017 22:36:03 +0100
Subject: [PATCH 57/90] Fix syntax error

---
 spacy/tests/lang/ga/test_tokenizer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py
index 1c6f68bad..cf3caf6de 100644
--- a/spacy/tests/lang/ga/test_tokenizer.py
+++ b/spacy/tests/lang/ga/test_tokenizer.py
@@ -5,7 +5,7 @@ import pytest
 
 
 GA_TOKEN_EXCEPTION_TESTS = [
-    ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '('. 'lch.', '600', ')', '.']),
+    ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '(', 'lch.', '600', ')', '.']),
     ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
 ]
 
@@ -15,4 +15,3 @@ def test_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens):
     tokens = ga_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
-

From ba2e6c8c6f7a6ee3d8d4dab66b375a0cb78263b0 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Tue, 31 Oct 2017 23:23:34 +0100
Subject: [PATCH 58/90] Update docstrings and formatting

---
 spacy/vectors.pyx | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 08ab586d1..131a751dc 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -28,7 +28,7 @@ cdef class Vectors:
     instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
     (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
     rows in the vectors.data table.
-    
+
     Multiple keys can be mapped to the same vector, and not all of the rows in
     the table need to be assigned --- so len(list(vectors.keys())) may be
     greater or smaller than vectors.shape[0].
@@ -39,9 +39,10 @@ cdef class Vectors:
 
     def __init__(self, *, shape=None, data=None, keys=None):
         """Create a new vector store.
-        
+
         shape (tuple): Size of the table, as (# entries, # columns)
         data (numpy.ndarray): The vector data.
+        keys (iterable): A sequence of keys, aligned with the data.
         RETURNS (Vectors): The newly created object.
         """
         if data is None:
@@ -57,7 +58,7 @@ cdef class Vectors:
         if keys is not None:
             for i, key in enumerate(keys):
                 self.add(key, row=i)
-    
+
     @property
     def shape(self):
         """Get `(rows, dims)` tuples of number of rows and number of dimensions
@@ -102,7 +103,7 @@ cdef class Vectors:
         """Set a vector for the given key.
 
         key (int): The key to set the vector for.
-        vector (numpy.ndarray): The vector to set.
+        vector (ndarray): The vector to set.
         """
         i = self.key2row[key]
         self.data[i] = vector
@@ -110,9 +111,9 @@ cdef class Vectors:
             self._unset.remove(i)
 
     def __iter__(self):
-        """Yield vectors from the table.
+        """Iterate over the keys in the table.
 
-        YIELDS (ndarray): A vector.
+        YIELDS (int): A key in the table.
         """
         yield from self.key2row
 
@@ -132,14 +133,14 @@ cdef class Vectors:
         return key in self.key2row
 
     def resize(self, shape, inplace=False):
-        '''Resize the underlying vectors array. If inplace=True, the memory
+        """Resize the underlying vectors array. If inplace=True, the memory
         is reallocated. This may cause other references to the data to become
         invalid, so only use inplace=True if you're sure that's what you want.
 
         If the number of vectors is reduced, keys mapped to rows that have been
         deleted are removed. These removed items are returned as a list of
-        (key, row) tuples.
-        '''
+        `(key, row)` tuples.
+        """
         if inplace:
             self.data.resize(shape, refcheck=False)
         else:
@@ -153,16 +154,22 @@ cdef class Vectors:
                 self.key2row.pop(key)
                 removed_items.append((key, row))
         return removed_items
-    
+
     def keys(self):
-        '''Iterate over the keys in the table.'''
-        yield from self.key2row.keys()
-    
+        """A sequence of the keys in the table.
+
+        RETURNS (iterable): The keys.
+        """
+        return self.key2row.keys()
+
     def values(self):
-        '''Iterate over vectors that have been assigned to at least one key.
+        """Iterate over vectors that have been assigned to at least one key.
 
         Note that some vectors may be unassigned, so the number of vectors
-        returned may be less than the length of the vectors table.'''
+        returned may be less than the length of the vectors table.
+
+        YIELDS (ndarray): A vector in the table.
+        """
         for row, vector in enumerate(range(self.data.shape[0])):
             if row not in self._unset:
                 yield vector
@@ -208,12 +215,12 @@ cdef class Vectors:
             if row in self._unset:
                 self._unset.remove(row)
         return row
-    
+
     def most_similar(self, queries, *, return_scores=False, return_rows=False,
             batch_size=1024):
         '''For each of the given vectors, find the single entry most similar
         to it, by cosine.
-        
+
         Queries are by vector. Results are returned as an array of keys,
         or a tuple of (keys, scores) if return_scores=True. If `queries` is
         large, the calculations are performed in chunks, to avoid consuming
@@ -221,9 +228,9 @@ cdef class Vectors:
         trade-off during the calculations.
         '''
         xp = get_array_module(self.data)
-        
+
         vectors = self.data / xp.linalg.norm(self.data, axis=1, keepdims=True)
-        
+
         best_rows = xp.zeros((queries.shape[0],), dtype='i')
         scores = xp.zeros((queries.shape[0],), dtype='f')
         # Work in batches, to avoid memory problems.

From 00ecfa5417e6ceff9a2ef55ad36ffe475aa2e65b Mon Sep 17 00:00:00 2001
From: Jim O'Regan <jaoregan@tcd.ie>
Date: Tue, 31 Oct 2017 22:54:42 +0000
Subject: [PATCH 59/90] =?UTF-8?q?=C3=93,=20not=20O?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spacy/tests/lang/ga/test_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py
index 1c6f68bad..8aa917a6f 100644
--- a/spacy/tests/lang/ga/test_tokenizer.py
+++ b/spacy/tests/lang/ga/test_tokenizer.py
@@ -5,7 +5,7 @@ import pytest
 
 
 GA_TOKEN_EXCEPTION_TESTS = [
-    ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '('. 'lch.', '600', ')', '.']),
+    ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'Ó', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '('. 'lch.', '600', ')', '.']),
     ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
 ]
 

From 2ad2f09d1232048ccd85d3eacd58c4fc50bd6194 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 00:18:08 +0100
Subject: [PATCH 60/90] Update docstrings and simplify most_similar

---
 spacy/vectors.pyx | 47 ++++++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 131a751dc..96ccff518 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -70,17 +70,18 @@ cdef class Vectors:
 
     @property
     def size(self):
-        """Return rows*dims"""
+        """RETURNS (int): rows*dims"""
         return self.data.shape[0] * self.data.shape[1]
 
     @property
     def is_full(self):
-        """Returns True if no keys are available for new keys."""
+        """RETURNS (bool): `True` if no slots are available for new keys."""
         return len(self._unset) == 0
 
     @property
     def n_keys(self):
-        """Returns True if no keys are available for new keys."""
+        """RETURNS (int) The number of keys in the table. Note that this is the
+        number of all keys, not just unique vectors."""
         return len(self.key2row)
 
     def __reduce__(self):
@@ -198,9 +199,10 @@ cdef class Vectors:
         """Add a key to the table. Keys can be mapped to an existing vector
         by setting `row`, or a new vector can be added.
 
-        key (unicode / int): The key to add.
-        vector (numpy.ndarray / None): A vector to add for the key.
-        row (int / None): The row-number of a vector to map the key to.
+        key (int): The key to add.
+        vector (ndarray / None): A vector to add for the key.
+        row (int / None): The row number of a vector to map the key to.
+        RETURNS (int): The row the vector was added to.
         """
         if row is None and key in self.key2row:
             row = self.key2row[key]
@@ -216,17 +218,20 @@ cdef class Vectors:
                 self._unset.remove(row)
         return row
 
-    def most_similar(self, queries, *, return_scores=False, return_rows=False,
-            batch_size=1024):
-        '''For each of the given vectors, find the single entry most similar
+    def most_similar(self, queries, *, batch_size=1024):
+        """For each of the given vectors, find the single entry most similar
         to it, by cosine.
 
-        Queries are by vector. Results are returned as an array of keys,
-        or a tuple of (keys, scores) if return_scores=True. If `queries` is
-        large, the calculations are performed in chunks, to avoid consuming
-        too much memory. You can set the `batch_size` to control the size/space
-        trade-off during the calculations.
-        '''
+        Queries are by vector. Results are returned as a `(keys, best_rows,
+        scores)` tuple. If `queries` is large, the calculations are performed in
+        chunks, to avoid consuming too much memory. You can set the `batch_size`
+        to control the size/space trade-off during the calculations.
+
+        queries (ndarray): An array with one or more vectors.
+        batch_size (int): The batch size to use.
+        RETURNS (tuple): The most similar entry as a `(keys, best_rows, scores)`
+            tuple.
+        """
         xp = get_array_module(self.data)
 
         vectors = self.data / xp.linalg.norm(self.data, axis=1, keepdims=True)
@@ -244,14 +249,7 @@ cdef class Vectors:
             best_rows[i:i+batch_size] = sims.argmax(axis=1)
             scores[i:i+batch_size] = sims.max(axis=1)
         keys = self.get_keys(best_rows)
-        if return_rows and return_scores:
-            return (keys, best_rows, scores)
-        elif return_rows:
-            return (keys, best_rows)
-        elif return_scores:
-            return (keys, scores)
-        else:
-            return keys
+        return (keys, best_rows, scores)
 
     def from_glove(self, path):
         """Load GloVe vectors from a directory. Assumes binary format,
@@ -261,8 +259,7 @@ cdef class Vectors:
         By default GloVe outputs 64-bit vectors.
 
         path (unicode / Path): The path to load the GloVe vectors from.
-
-        RETURNS: A StringStore object, holding the key-to-string mapping.
+        RETURNS: A `StringStore` object, holding the key-to-string mapping.
         """
         path = util.ensure_path(path)
         width = None

From c16310d15673b781a8cfb65c43f862e7fd02a1dd Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 00:34:55 +0100
Subject: [PATCH 61/90] Update vectors with find method

---
 spacy/vectors.pyx | 49 ++++++++++++++++++++++++++++++++++++-----------
 spacy/vocab.pyx   |  5 ++---
 2 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 96ccff518..95378947a 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -10,7 +10,7 @@ cimport numpy as np
 from thinc.neural.util import get_array_module
 from thinc.neural._classes.model import Model
 
-from .strings cimport StringStore
+from .strings cimport StringStore, hash_string
 from .compat import basestring_, path2str
 from . import util
 
@@ -183,17 +183,42 @@ cdef class Vectors:
         for key, row in self.key2row.items():
             yield key, self.data[row]
 
-    def get_keys(self, rows):
-        xp = get_array_module(self.data)
-        row2key = {row: key for key, row in self.key2row.items()}
-        keys = xp.asarray([row2key[row] for row in rows],
-                           dtype='uint64')
-        return keys
+    def find(self, *, key=None, keys=None, row=None, rows=None):
+        '''Lookup one or more keys by row, or vice versa.
 
-    def get_rows(self, keys):
+        key (unicode / int): Find the row that the given key points to.
+            Returns int, -1 if missing.
+        keys (sequence): Find rows that the keys point to.
+            Returns ndarray.
+        row (int): Find the first key that point to the row.
+            Returns int.
+        rows (sequence): Find the first keys that points to the rows.
+            Returns ndarray.
+        '''
+        if sum(arg is None for arg in (key, keys, row, rows)) != 3:
+            raise ValueError("One (and only one) keyword arg must be set.")
         xp = get_array_module(self.data)
-        k2r = self.key2row
-        return xp.asarray([k2r.get(key, -1) for key in keys], dtype='i')
+        if key is not None:
+            if isinstance(key, basestring_):
+                key = hash_string(key)
+            return self.key2row.get(key, -1)
+        elif keys is not None:
+            keys = [hash_string(key) if isinstance(key, basestring_) else key
+                    for key in keys]
+            rows = [self.key2row.get(key, -1.) for key in keys]
+            return xp.asarray(rows, dtype='i')
+        else:
+            targets = set()
+            if row is not None:
+                targets.add(row)
+            else:
+                targets.update(rows)
+            results = []
+            for key, row in self.key2row.items():
+                if row in targets:
+                    results.append(key)
+                    targets.remove(row)
+            return xp.asarray(results, dtype='uint64')
 
     def add(self, key, *, vector=None, row=None):
         """Add a key to the table. Keys can be mapped to an existing vector
@@ -204,6 +229,8 @@ cdef class Vectors:
         row (int / None): The row number of a vector to map the key to.
         RETURNS (int): The row the vector was added to.
         """
+        if isinstance(key, basestring):
+            key = hash_string(key)
         if row is None and key in self.key2row:
             row = self.key2row[key]
         elif row is None:
@@ -248,7 +275,7 @@ cdef class Vectors:
             sims = xp.dot(batch, vectors.T)
             best_rows[i:i+batch_size] = sims.argmax(axis=1)
             scores[i:i+batch_size] = sims.max(axis=1)
-        keys = self.get_keys(best_rows)
+        keys = self.find(rows=best_rows)
         return (keys, best_rows, scores)
 
     def from_glove(self, path):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 14b62a808..675e4a805 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -286,14 +286,13 @@ cdef class Vocab:
         priority.sort()
         indices = xp.asarray([i for (prob, i, key) in priority], dtype='i')
         keys = xp.asarray([key for (prob, i, key) in priority], dtype='uint64')
-        
+
         keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
 
         self.vectors = Vectors(data=keep, keys=keys)
 
-        syn_keys, syn_rows, scores = self.vectors.most_similar(toss,
-                                        return_rows=True, return_scores=True)
+        syn_keys, syn_rows, scores = self.vectors.most_similar(toss)
 
         remap = {}
         for i, key in enumerate(keys[nr_row:]):

From 5683fd65ed32e7c7fc3f15ecaad619591ee10f3c Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 00:42:39 +0100
Subject: [PATCH 62/90] Update docstrings

---
 spacy/vectors.pyx | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 95378947a..b1d17a026 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -184,17 +184,18 @@ cdef class Vectors:
             yield key, self.data[row]
 
     def find(self, *, key=None, keys=None, row=None, rows=None):
-        '''Lookup one or more keys by row, or vice versa.
+        """Look up one or more keys by row, or vice versa.
 
         key (unicode / int): Find the row that the given key points to.
             Returns int, -1 if missing.
-        keys (sequence): Find rows that the keys point to.
+        keys (iterable): Find rows that the keys point to.
             Returns ndarray.
         row (int): Find the first key that point to the row.
             Returns int.
-        rows (sequence): Find the first keys that points to the rows.
+        rows (iterable): Find the keys that point to the rows.
             Returns ndarray.
-        '''
+        RETURNS: The requested key, keys, row or rows.
+        """
         if sum(arg is None for arg in (key, keys, row, rows)) != 3:
             raise ValueError("One (and only one) keyword arg must be set.")
         xp = get_array_module(self.data)

From 4b196fdf7f0fced193a70507e40b776cf4fd5439 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 00:43:22 +0100
Subject: [PATCH 63/90] Fix formatting

---
 examples/information_extraction/entity_relations.py |  3 +--
 examples/information_extraction/parse_subtrees.py   |  5 ++---
 examples/information_extraction/phrase_matcher.py   |  3 ++-
 examples/pipeline/multi_processing.py               |  5 +++--
 examples/training/train_ner.py                      |  3 +--
 examples/training/train_new_entity_type.py          |  3 +--
 examples/training/train_parser.py                   |  7 ++-----
 examples/training/train_tagger.py                   |  5 ++---
 examples/training/train_textcat.py                  |  5 ++---
 examples/vectors_fast_text.py                       | 11 ++++++-----
 10 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py
index b73dcbf3b..47b20057c 100644
--- a/examples/information_extraction/entity_relations.py
+++ b/examples/information_extraction/entity_relations.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-A simple example of extracting relations between phrases and entities using
+"""A simple example of extracting relations between phrases and entities using
 spaCy's named entity recognizer and the dependency parse. Here, we extract
 money and currency values (entities labelled as MONEY) and then check the
 dependency tree to find the noun phrase they are referring to – for example:
diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py
index 5963d014c..2a258b31d 100644
--- a/examples/information_extraction/parse_subtrees.py
+++ b/examples/information_extraction/parse_subtrees.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-This example shows how to navigate the parse tree including subtrees attached
-to a word.
+"""This example shows how to navigate the parse tree including subtrees
+attached to a word.
 
 Based on issue #252:
 "In the documents and tutorials the main thing I haven't found is
diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py
index 2dd2691b9..0b5bcdc7f 100644
--- a/examples/information_extraction/phrase_matcher.py
+++ b/examples/information_extraction/phrase_matcher.py
@@ -1,9 +1,10 @@
+#!/usr/bin/env python
+# coding: utf8
 """Match a large set of multi-word expressions in O(1) time.
 
 The idea is to associate each word in the vocabulary with a tag, noting whether
 they begin, end, or are inside at least one pattern. An additional tag is used
 for single-word patterns. Complete patterns are also stored in a hash set.
-
 When we process a document, we look up the words in the vocabulary, to
 associate the words with the tags.  We then search for tag-sequences that
 correspond to valid candidates. Finally, we look up the candidates in the hash
diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py
index 19b1c462a..99bb9c53f 100644
--- a/examples/pipeline/multi_processing.py
+++ b/examples/pipeline/multi_processing.py
@@ -1,5 +1,6 @@
-"""
-Example of multi-processing with Joblib. Here, we're exporting
+#!/usr/bin/env python
+# coding: utf8
+"""Example of multi-processing with Joblib. Here, we're exporting
 part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with
 each "sentence" on a newline, and spaces between tokens. Data is loaded from
 the IMDB movie reviews dataset and will be loaded automatically via Thinc's
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 499807d23..e95cce4c9 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-Example of training spaCy's named entity recognizer, starting off with an
+"""Example of training spaCy's named entity recognizer, starting off with an
 existing model or a blank model.
 
 For more details, see the documentation:
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index ec1e562c6..1c70f7c03 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-Example of training an additional entity type
+"""Example of training an additional entity type
 
 This script shows how to add a new entity type to an existing pre-trained NER
 model. To keep the example short and simple, only four sentences are provided
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index a23d73ec7..e321fdb1e 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -1,10 +1,7 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-Example of training spaCy dependency parser, starting off with an existing model
-or a blank model.
-
-For more details, see the documentation:
+"""Example of training spaCy dependency parser, starting off with an existing
+model or a blank model. For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
 
diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index c6fc1de88..7508c2e66 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -3,9 +3,8 @@
 """
 A simple example for training a part-of-speech tagger with a custom tag map.
 To allow us to update the tag map with our custom one, this example starts off
-with a blank Language class and modifies its defaults.
-
-For more details, see the documentation:
+with a blank Language class and modifies its defaults. For more details, see
+the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
 
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 1f9cd29aa..fc9610a66 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -3,9 +3,8 @@
 """Train a multi-label convolutional neural network text classifier on the
 IMDB dataset, using the TextCategorizer component. The dataset will be loaded
 automatically via Thinc's built-in dataset loader. The model is added to
-spacy.pipeline, and predictions are available via `doc.cats`.
-
-For more details, see the documentation:
+spacy.pipeline, and predictions are available via `doc.cats`. For more details,
+see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Text classification: https://alpha.spacy.io/usage/text-classification
 
diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py
index 159250098..d14f6724f 100644
--- a/examples/vectors_fast_text.py
+++ b/examples/vectors_fast_text.py
@@ -13,8 +13,7 @@ import from spacy.language import Language
 @plac.annotations(
     vectors_loc=("Path to vectors", "positional", None, str))
 def main(vectors_loc):
-    nlp = Language()
-
+    nlp = Language()  # start off with a blank Language class
     with open(vectors_loc, 'rb') as file_:
         header = file_.readline()
         nr_row, nr_dim = header.split()
@@ -24,9 +23,11 @@ def main(vectors_loc):
             pieces = line.split()
             word = pieces[0]
             vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
-            nlp.vocab.set_vector(word, vector)
-    doc = nlp(u'class colspan')
-    print(doc[0].similarity(doc[1]))
+            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
+    # test the vectors and similarity
+    text = 'class colspan'
+    doc = nlp(text)
+    print(text, doc[0].similarity(doc[1]))
 
 
 if __name__ == '__main__':

From 0ca152a01541f306ca23d5c41b0f56a1dafef260 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 00:43:28 +0100
Subject: [PATCH 64/90] Fix syntax error

---
 examples/vectors_fast_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py
index d14f6724f..5b763fe0a 100644
--- a/examples/vectors_fast_text.py
+++ b/examples/vectors_fast_text.py
@@ -7,7 +7,7 @@ from __future__ import unicode_literals
 import plac
 import numpy
 
-import from spacy.language import Language
+from spacy.language import Language
 
 
 @plac.annotations(

From 86eba61fae69827df0571eddcf754a2954c81358 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 00:47:35 +0100
Subject: [PATCH 65/90] Fix token.vector when vectors are missing

---
 spacy/tests/spans/test_span.py | 3 +--
 spacy/tokens/token.pyx         | 5 +----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py
index 4050809b5..b8638ba4b 100644
--- a/spacy/tests/spans/test_span.py
+++ b/spacy/tests/spans/test_span.py
@@ -118,8 +118,7 @@ def test_span_to_array(doc):
     assert arr[0, 1] == len(span[0])
 
 
-@pytest.mark.xfail
 def test_span_as_doc(doc):
     span = doc[4:10]
     span_doc = span.as_doc()
-    assert span.text == span_doc.text
+    assert span.text == span_doc.text.strip()
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index fa07d0e9e..c3d92b389 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -302,10 +302,7 @@ cdef class Token:
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector'](self)
-            if self.has_vector:
-                return self.vocab.get_vector(self.c.lex.orth)
-            else:
-                return self.doc.tensor[self.i]
+            return self.vocab.get_vector(self.c.lex.orth)
 
     property vector_norm:
         """The L2 norm of the token's vector representation.

From 0cde065ed9406e40d301b6234ac1b2cb6a7fd130 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 00:56:21 +0100
Subject: [PATCH 66/90] Add Irish to list of languages (see #1152)

---
 website/models/_data.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/models/_data.json b/website/models/_data.json
index 959d73133..1120b0fcc 100644
--- a/website/models/_data.json
+++ b/website/models/_data.json
@@ -100,6 +100,7 @@
         "hu": "Hungarian",
         "pl": "Polish",
         "he": "Hebrew",
+        "ga": "Irish",
         "bn": "Bengali",
         "hi": "Hindi",
         "id": "Indonesian",

From 9eb998443fe1beefdf65914e6df4359e67207b67 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 00:56:35 +0100
Subject: [PATCH 67/90] Update language tokenizer dependencies

---
 website/usage/_models/_languages.jade | 1 -
 1 file changed, 1 deletion(-)

diff --git a/website/usage/_models/_languages.jade b/website/usage/_models/_languages.jade
index 4337b5b99..7163d8448 100644
--- a/website/usage/_models/_languages.jade
+++ b/website/usage/_models/_languages.jade
@@ -46,7 +46,6 @@ p
         +item #[strong Chinese]: #[+a("https://github.com/fxsjy/jieba") Jieba]
         +item #[strong Japanese]: #[+a("https://github.com/mocobeta/janome") Janome]
         +item #[strong Thai]: #[+a("https://github.com/wannaphongcom/pythainlp") pythainlp]
-        +item #[strong Russian]: #[+a("https://github.com/kmike/pymorphy2") pymorphy2]
 
 +h(3, "multi-language") Multi-language support
     +tag-new(2)

From 0d8f4a534b8966083c7e6e938f5b5cfec39ada65 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 00:56:54 +0100
Subject: [PATCH 68/90] Update Vectors API docs

---
 website/api/vectors.jade | 243 +++++++++++++++++++++++++++------------
 1 file changed, 172 insertions(+), 71 deletions(-)

diff --git a/website/api/vectors.jade b/website/api/vectors.jade
index 692bd1ca8..9685188c5 100644
--- a/website/api/vectors.jade
+++ b/website/api/vectors.jade
@@ -5,46 +5,47 @@ include ../_includes/_mixins
 p
     |  Vectors data is kept in the #[code Vectors.data] attribute, which should
     |  be an instance of #[code numpy.ndarray] (for CPU vectors) or
-    |  #[code cupy.ndarray] (for GPU vectors).
+    |  #[code cupy.ndarray] (for GPU vectors). Multiple keys can be mapped to
+    |  the same vector, and not all of the rows in the table need to be
+    |  assigned – so #[code vectors.n_keys] may be greater or smaller than
+    |  #[code vectors.shape[0]].
 
 +h(2, "init") Vectors.__init__
     +tag method
 
 p
-    |  Create a new vector store. To keep the vector table empty, pass
-    |  #[code width=0]. You can also create the vector table and add
-    |  vectors one by one, or set the vector values directly on initialisation.
+    |  Create a new vector store. You can set the vector values and keys
+    |  directly on initialisation, or supply a #[code shape] keyword argument
+    |  to create an empty table you can add vectors to later.
 
 +aside-code("Example").
     from spacy.vectors import Vectors
-    from spacy.strings import StringStore
 
-    empty_vectors = Vectors(StringStore())
+    empty_vectors = Vectors(shape=(10000, 300))
 
-    vectors = Vectors([u'cat'], width=300)
-    vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
-
-    vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), data=vector_table)
+    data = numpy.zeros((3, 300), dtype='f')
+    keys = [u'cat', u'dog', u'rat']
+    vectors = Vectors(data=data, keys=keys)
 
 +table(["Name", "Type", "Description"])
-    +row
-        +cell #[code strings]
-        +cell #[code StringStore] or list
-        +cell
-            |  List of strings, or a #[+api("stringstore") #[code StringStore]]
-            |  that maps strings to hash values, and vice versa.
-
-    +row
-        +cell #[code width]
-        +cell int
-        +cell Number of dimensions.
-
     +row
         +cell #[code data]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell The vector data.
 
+    +row
+        +cell #[code keys]
+        +cell iterable
+        +cell A sequence of keys aligned with the data.
+
+    +row
+        +cell #[code shape]
+        +cell tuple
+        +cell
+            |  Size of the table as #[code (n_entries, n_columns)], the number
+            |  of entries and number of columns. Not required if you're
+            |  initialising the object with #[code data] and #[code keys].
+
     +row("foot")
         +cell returns
         +cell #[code Vectors]
@@ -54,97 +55,92 @@ p
     +tag method
 
 p
-    |  Get a vector by key. If key is a string, it is hashed to an integer ID
-    |  using the #[code Vectors.strings] table. If the integer key is not found
-    |  in the table, a #[code KeyError] is raised.
+    |  Get a vector by key. If the key is not found in the table, a
+    |  #[code KeyError] is raised.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
-    cat_vector = vectors[u'cat']
+    cat_id = nlp.vocab.strings[u'cat']
+    cat_vector = nlp.vocab.vectors[cat_id]
+    assert (cat_vector == nlp.vocab[u'cat'].vector).all()
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code key]
-        +cell unicode / int
+        +cell int
         +cell The key to get the vector for.
 
     +row
         +cell returns
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell The vector for the key.
 
 +h(2, "setitem") Vectors.__setitem__
     +tag method
 
 p
-    |  Set a vector for the given key. If key is a string, it is hashed to an
-    |  integer ID using the #[code Vectors.strings] table.
+    |  Set a vector for the given key.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
+    cat_id = nlp.vocab.strings[u'cat']
+    vector = numpy.random.uniform(-1, 1, (300,))
+    nlp.vocab.vectors[cat_id] = vector
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code key]
-        +cell unicode / int
+        +cell int
         +cell The key to set the vector for.
 
     +row
         +cell #[code vector]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell The vector to set.
 
 +h(2, "iter") Vectors.__iter__
     +tag method
 
-p Yield vectors from the table.
+p Iterate over the keys in the table.
 
 +aside-code("Example").
-    vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
-    for vector in vectors:
-        print(vector)
+    for key in nlp.vocab.vectors:
+        print(key, nlp.vocab.strings[key])
 
 +table(["Name", "Type", "Description"])
     +row("foot")
         +cell yields
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
-        +cell A vector from the table.
+        +cell int
+        +cell A key in the table.
 
 +h(2, "len") Vectors.__len__
     +tag method
 
-p Return the number of vectors that have been assigned.
+p Return the number of vectors in the table.
 
 +aside-code("Example").
-    vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
+    vectors = Vectors(shape=(3, 300))
     assert len(vectors) == 3
 
 +table(["Name", "Type", "Description"])
     +row("foot")
         +cell returns
         +cell int
-        +cell The number of vectors in the data.
+        +cell The number of vectors in the table.
 
 +h(2, "contains") Vectors.__contains__
     +tag method
 
 p
-    |  Check whether a key has a vector entry in the table. If key is a string,
-    |  it is hashed to an integer ID using the #[code Vectors.strings] table.
+    |  Check whether a key has been mapped to a vector entry in the table.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
-    assert u'cat' in vectors
+    cat_id = nlp.vocab.strings[u'cat']
+    nlp.vocab.vectors.add(cat_id, vector=numpy.random.uniform(-1, 1, (300,)))
+    assert cat_id in nlp.vocab.vectors
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code key]
-        +cell unicode / int
+        +cell int
         +cell The key to check.
 
     +row("foot")
@@ -156,13 +152,20 @@ p
     +tag method
 
 p
-    |  Add a key to the table, optionally setting a vector value as well. If
-    |  key is a string, it is hashed to an integer ID using the
-    |  #[code Vectors.strings] table.
+    |  Add a key to the table, optionally setting a vector value as well. Keys
+    |  can be mapped to an existing vector by setting #[code row], or a new
+    |  vector can be added. When adding unicode keys, keep in mind that the
+    |  #[code Vectors] class itself has no
+    |  #[+api("stringstore") #[code StringStore]], so you have to store the
+    |  hash-to-string mapping separately. If you need to manage the strings,
+    |  you should use the #[code Vectors] via the
+    |  #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors].
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
+    vector = numpy.random.uniform(-1, 1, (300,))
+    cat_id = nlp.vocab.strings[u'cat']
+    nlp.vocab.vectors.add(cat_id, vector=vector)
+    nlp.vocab.vectors.add(u'dog', row=0)
 
 +table(["Name", "Type", "Description"])
     +row
@@ -172,25 +175,66 @@ p
 
     +row
         +cell #[code vector]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
-        +cell An optional vector to add.
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+        +cell An optional vector to add for the key.
+
+    +row
+        +cell #[code row]
+        +cell int
+        +cell An optional row number of a vector to map the key to.
+
+    +row("foot")
+        +cell returns
+        +cell int
+        +cell The row the vector was added to.
+
++h(2, "keys") Vectors.keys
+    +tag method
+
+p A sequence of the keys in the table.
+
++aside-code("Example").
+    for key in nlp.vocab.vectors.keys():
+        print(key, nlp.vocab.strings[key])
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell iterable
+        +cell The keys.
+
++h(2, "values") Vectors.values
+    +tag method
+
+p
+    |  Iterate over vectors that have been assigned to at least one key. Note
+    |  that some vectors may be unassigned, so the number of vectors returned
+    |  may be less than the length of the vectors table.
+
++aside-code("Example").
+    for vector in nlp.vocab.vectors.values():
+        print(vector)
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell yields
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+        +cell A vector in the table.
 
 +h(2, "items") Vectors.items
     +tag method
 
-p Iterate over #[code (string key, vector)] pairs, in order.
+p Iterate over #[code (key, vector)] pairs, in order.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
-    for key, vector in vectors.items():
-        print(key, vector)
+    for key, vector in nlp.vocab.vectors.items():
+        print(key, nlp.vocab.strings[key], vector)
 
 +table(["Name", "Type", "Description"])
     +row("foot")
         +cell yields
         +cell tuple
-        +cell #[code (string key, vector)] pairs, in order.
+        +cell #[code (key, vector)] pairs, in order.
 
 +h(2, "shape") Vectors.shape
     +tag property
@@ -200,7 +244,7 @@ p
     |  dimensions in the vector table.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
+    vectors = Vectors(shape=(1, 300))
     vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
     rows, dims = vectors.shape
     assert rows == 1
@@ -212,6 +256,59 @@ p
         +cell tuple
         +cell A #[code (rows, dims)] pair.
 
++h(2, "size") Vectors.size
+    +tag property
+
+p The vector size, i.e. #[code rows * dims].
+
++aside-code("Example").
+    vectors = Vectors(shape=(500, 300))
+    assert vectors.size == 150000
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell int
+        +cell The vector size.
+
++h(2, "is_full") Vectors.is_full
+    +tag property
+
+p
+    |  Whether the vectors table is full and has no slots available for new
+    |  keys. If a table is full, it can be resized using
+    |  #[+api("vectors#resize") #[code Vectors.resize]].
+
++aside-code("Example").
+    vectors = Vectors(shape=(1, 300))
+    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
+    assert vectors.is_full
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell bool
+        +cell Whether the vectors table is full.
+
++h(2, "n_keys") Vectors.n_keys
+    +tag property
+
+p
+    |  Get the number of keys in the table. Note that this is the number of
+    |  #[em all] keys, not just unique vectors. If several keys are mapped
+    |  to the same vectors, they will be counted individually.
+
++aside-code("Example").
+    vectors = Vectors(shape=(10, 300))
+    assert len(vectors) == 10
+    assert vectors.n_keys == 0
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell int
+        +cell The number of all keys in the table.
+
 +h(2, "from_glove") Vectors.from_glove
     +tag method
 
@@ -223,6 +320,10 @@ p
     |  float32 vectors, #[code vectors.300.d.bin] for 300d float64 (double)
     |  vectors, etc. By default GloVe outputs 64-bit vectors.
 
++aside-code("Example").
+    vectors = Vectors()
+    vectors.from_glove('/path/to/glove_vectors')
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code path]
@@ -323,7 +424,7 @@ p Load state from a binary string.
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code data]
-        +cell #[code numpy.ndarray] / #[code cupy.ndarray]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell
             |  Stored vectors data. #[code numpy] is used for CPU vectors,
             |  #[code cupy] for GPU vectors.
@@ -337,7 +438,7 @@ p Load state from a binary string.
 
     +row
         +cell #[code keys]
-        +cell #[code numpy.ndarray]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell
             |  Array keeping the keys in order, such that
             |  #[code keys[vectors.key2row[key]] == key]

From 37e62ab0e21757751b9606cefbc9f1deec8f2300 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 01:25:09 +0100
Subject: [PATCH 69/90] Update vector meta in meta.json

---
 spacy/cli/package.py | 3 ++-
 spacy/cli/train.py   | 3 ++-
 spacy/language.py    | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 3157ba99d..5ee8a2b1e 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -99,7 +99,8 @@ def generate_meta(model_path, existing_meta):
     nlp = util.load_model_from_path(Path(model_path))
     meta['pipeline'] = nlp.pipe_names
     meta['vectors'] = {'width': nlp.vocab.vectors_length,
-                       'entries': len(nlp.vocab.vectors)}
+                       'vectors': len(nlp.vocab.vectors),
+                       'keys': nlp.vocab.vectors.n_keys}
     prints("Enter the package settings for your model. The following "
            "information will be read from your model data: pipeline, vectors.",
            title="Generating meta.json")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 74e1d6d68..f489ba7bf 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -146,7 +146,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
                 meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
                                  'gpu': gpu_wps}
                 meta['vectors'] = {'width': nlp.vocab.vectors_length,
-                                   'entries': len(nlp.vocab.vectors)}
+                                   'vectors': len(nlp.vocab.vectors),
+                                   'keys': nlp.vocab.vectors.n_keys}
                 meta['lang'] = nlp.lang
                 meta['pipeline'] = pipeline
                 meta['spacy_version'] = '>=%s' % about.__version__
diff --git a/spacy/language.py b/spacy/language.py
index 1ce74b265..806172f36 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -155,7 +155,8 @@ class Language(object):
         self._meta.setdefault('url', '')
         self._meta.setdefault('license', '')
         self._meta['vectors'] = {'width': self.vocab.vectors_length,
-                                 'entries': len(self.vocab.vectors)}
+                                 'vectors': len(self.vocab.vectors),
+                                 'keys': self.vocab.vectors.n_keys}
         self._meta['pipeline'] = self.pipe_names
         return self._meta
 

From 07d02c33040b250abf0ddc43b95a638c12ab4b54 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 01:25:17 +0100
Subject: [PATCH 70/90] Update vectors and similarity usage guide

---
 website/usage/_data.json                      |   1 -
 .../usage/_vectors-similarity/_basics.jade    | 124 +++++++++++++
 .../usage/_vectors-similarity/_custom.jade    | 167 ++++++++++++++----
 .../_vectors-similarity/_in-context.jade      | 123 -------------
 website/usage/vectors-similarity.jade         |   4 -
 5 files changed, 260 insertions(+), 159 deletions(-)
 delete mode 100644 website/usage/_vectors-similarity/_in-context.jade

diff --git a/website/usage/_data.json b/website/usage/_data.json
index 4a4e6df01..498202695 100644
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@@ -116,7 +116,6 @@
         "next": "text-classification",
         "menu": {
             "Basics": "basics",
-            "Similarity in Context": "in-context",
             "Custom Vectors": "custom",
             "GPU Usage": "gpu"
         }
diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade
index b8f8d834c..300680331 100644
--- a/website/usage/_vectors-similarity/_basics.jade
+++ b/website/usage/_vectors-similarity/_basics.jade
@@ -13,3 +13,127 @@
 
 include ../_spacy-101/_similarity
 include ../_spacy-101/_word-vectors
+
++h(3, "in-context") Similarities in context
+
+p
+    |  Aside from spaCy's built-in word vectors, which were trained on a lot of
+    |  text with a wide vocabulary, the parsing, tagging and NER models also
+    |  rely on vector representations of the #[strong meanings of words in context].
+    |  As the first component of the
+    |  #[+a("/usage/processing-pipelines") processing pipeline], the
+    |  tensorizer encodes a document's internal meaning representations as an
+    |  array of floats, also called a tensor. This allows spaCy to make a
+    |  reasonable guess at a word's meaning, based on its surrounding words.
+    |  Even if a word hasn't been seen before, spaCy will know #[em something]
+    |  about it. Because spaCy uses a 4-layer convolutional network, the
+    |  tensors are sensitive to up to #[strong four words on either side] of a
+    |  word.
+
+p
+    |  For example, here are three sentences containing the out-of-vocabulary
+    |  word "labrador" in different contexts.
+
++code.
+    doc1 = nlp(u"The labrador barked.")
+    doc2 = nlp(u"The labrador swam.")
+    doc3 = nlp(u"the labrador people live in canada.")
+
+    for doc in [doc1, doc2, doc3]:
+        labrador = doc[1]
+        dog = nlp(u"dog")
+        print(labrador.similarity(dog))
+
+p
+    |  Even though the model has never seen the word "labrador", it can make a
+    |  fairly accurate prediction of its similarity to "dog" in different
+    |  contexts.
+
++table(["Context", "labrador.similarity(dog)"])
+    +row
+        +cell The #[strong labrador] barked.
+        +cell #[code 0.56] #[+procon("yes", "similar")]
+
+    +row
+        +cell The #[strong labrador] swam.
+        +cell #[code 0.48] #[+procon("no", "dissimilar")]
+
+    +row
+        +cell the #[strong labrador] people live in canada.
+        +cell #[code 0.39] #[+procon("no", "dissimilar")]
+
+p
+    |  The same also works for whole documents. Here, the variance of the
+    |  similarities is lower, as all words and their order are taken into
+    |  account. However, the context-specific similarity is often still
+    |  reflected pretty accurately.
+
++code.
+    doc1 = nlp(u"Paris is the largest city in France.")
+    doc2 = nlp(u"Vilnius is the capital of Lithuania.")
+    doc3 = nlp(u"An emu is a large bird.")
+
+    for doc in [doc1, doc2, doc3]:
+        for other_doc in [doc1, doc2, doc3]:
+            print(doc.similarity(other_doc))
+
+p
+    |  Even though the sentences about Paris and Vilnius consist of different
+    |  words and entities, they both describe the same concept and are seen as
+    |  more similar than the sentence about emus. In this case, even a misspelled
+    |  version of "Vilnius" would still produce very similar results.
+
++table
+    - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
+    - var counter = 0
+
+    +row
+    +row
+        +cell
+        for _, label in examples
+            +cell=label
+
+    each cells, label in examples
+        +row(counter ? null : "divider")
+            +cell=label
+            for cell in cells
+                +cell.u-text-center
+                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
+        - counter++
+
+p
+    |  Sentences that consist of the same words in different order will likely
+    |  be seen as very similar – but never identical.
+
++code.
+    docs = [nlp(u"dog bites man"), nlp(u"man bites dog"),
+            nlp(u"man dog bites"), nlp(u"dog man bites")]
+
+    for doc in docs:
+        for other_doc in docs:
+            print(doc.similarity(other_doc))
+
+p
+    |  Interestingly, "man bites dog" and "man dog bites" are seen as slightly
+    |  more similar than "man bites dog" and "dog bites man". This may be a
+    |  coincidence – or the result of "man" being interpreted as both sentences'
+    |  subject.
+
++table
+    - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]}
+    - var counter = 0
+
+    +row("head")
+        +cell
+        for _, label in examples
+            +cell.u-text-center=label
+
+    each cells, label in examples
+        +row(counter ? null : "divider")
+            +cell=label
+            for cell in cells
+                +cell.u-text-center
+                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
+        - counter++
diff --git a/website/usage/_vectors-similarity/_custom.jade b/website/usage/_vectors-similarity/_custom.jade
index da4be39fd..7792949d1 100644
--- a/website/usage/_vectors-similarity/_custom.jade
+++ b/website/usage/_vectors-similarity/_custom.jade
@@ -1,49 +1,137 @@
 //- 💫 DOCS > USAGE > VECTORS & SIMILARITY > CUSTOM VECTORS
 
 p
-    |  By default, #[+api("token#vector") #[code Token.vector]] returns the
-    |  vector for its underlying #[+api("lexeme") #[code Lexeme]], while
-    |  #[+api("doc#vector") #[code Doc.vector]] and
-    |  #[+api("span#vector") #[code Span.vector]] return an average of the
-    |  vectors of their tokens. You can customize these
-    |  behaviours by modifying the #[code doc.user_hooks],
-    |  #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
-    |  dictionaries.
+    |  Word vectors let you import knowledge from raw text into your model. The
+    |  knowledge is represented as a table of numbers, with one row per term in
+    |  your vocabulary. If two terms are used in similar contexts, the algorithm
+    |  that learns the vectors should assign them
+    |  #[strong rows that are quite similar], while words that are used in
+    |  different contexts will have quite different values. This lets you use
+    |  the row-values assigned to the words as a kind of dictionary, to tell you
+    |  some things about what the words in your text mean.
 
-+infobox
-    |  For more details on #[strong adding hooks] and #[strong overwriting] the
-    |  built-in #[code Doc], #[code Span] and #[code Token] methods, see the
-    |  usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
+p
+    |  Word vectors are particularly useful for terms which
+    |  #[strong aren&apos;t well represented in your labelled training data].
+    |  For instance, if you're doing named entity recognition, there will always
+    |  be lots of names that you don't have examples of. For instance, imagine
+    |  your training data happens to contain some examples of the term
+    |  "Microsoft", but it doesn't contain any examples of the term "Symantec".
+    |  In your raw text sample, there are plenty of examples of both terms, and
+    |  they're used in similar contexts. The word vectors make that fact
+    |  available to the entity recognition model. It still won't see examples of
+    |  "Symantec" labelled as a company. However, it'll see that "Symantec" has
+    |  a word vector that usually corresponds to company terms, so it can
+    |  #[strong make the inference].
+
+p
+    |  In order to make best use of the word vectors, you want the word vectors
+    |  table to cover a #[strong very large vocabulary]. However, most words are
+    |  rare, so most of the rows in a large word vectors table will be accessed
+    |  very rarely, or never at all. You can usually cover more than
+    |  #[strong 95% of the tokens] in your corpus with just
+    |  #[strong a few thousand rows] in the vector table. However, it's those
+    |  #[strong 5% of rare terms] where the word vectors are
+    |  #[strong most useful]. The problem is that increasing the size of the
+    |  vector table produces rapidly diminishing returns in coverage over these
+    |  rare terms.
+
++h(3, "custom-vectors-coverage") Optimising vector coverage
+    +tag-new(2)
+
+p
+    |  To help you strike a good balance between coverage and memory usage,
+    |  spaCy's #[+api("vectors") #[code Vectors]] class lets you map
+    |  #[strong multiple keys] to the #[strong same row] of the table. If
+    |  you're using the #[+api("cli#vocab") #[code spacy vocab]] command to
+    |  create a vocabulary, pruning the vectors will be taken care of
+    |  automatically. You can also do it manually in the following steps:
+
++list("numbers")
+    +item
+        |  Start with a #[strong word vectors model] that covers a huge
+        |  vocabulary. For instance, the
+        |  #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model
+        |  provides 300-dimensional GloVe vectors for over 1 million terms of
+        |  English.
+
+    +item
+        |  If your vocabulary has values set for the #[code Lexeme.prob]
+        |  attribute, the lexemes will be sorted by descending probability to
+        |  determine which vectors to prune. Otherwise, lexemes will be sorted
+        |  by their order in the #[code Vocab].
+
+    +item
+        |  Call #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] with
+        |  the number of vectors you want to keep.
+
++code.
+    nlp = spacy.load('en_vectors_web_lg')
+    n_vectors = 105000  # number of vectors to keep
+    removed_words = nlp.vocab.prune_vectors(n_vectors)
+
+    assert len(nlp.vocab.vectors) &lt;= n_vectors  # unique vectors have been pruned
+    assert nlp.vocab.vectors.n_keys &gt; n_vectors  # but not the total entries
+
+p
+    |  #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] reduces the
+    |  current vector table to a given number of unique entries, and returns a
+    |  dictionary containing the removed words, mapped to #[code (string, score)]
+    |  tuples, where #[code string] is the entry the removed word was mapped
+    |  to, and #[code score] the similarity score between the two words.
+
++code("Removed words").
+    {
+        'Shore': ('coast', 0.732257),
+        'Precautionary': ('caution', 0.490973),
+        'hopelessness': ('sadness', 0.742366),
+        'Continous': ('continuous', 0.732549),
+        'Disemboweled': ('corpse', 0.499432),
+        'biostatistician': ('scientist', 0.339724),
+        'somewheres': ('somewheres', 0.402736),
+        'observing': ('observe', 0.823096),
+        'Leaving': ('leaving', 1.0)
+    }
+
+p
+    |  In the example above, the vector for "Shore" was removed and remapped
+    |  to the vector of "coast", which is deemed about 73% similar. "Leaving"
+    |  was remapped to the vector of "leaving", which is identical.
 
 +h(3, "custom-vectors-add") Adding vectors
     +tag-new(2)
 
 p
-    |  The new #[+api("vectors") #[code Vectors]] class makes it easy to add
-    |  your own vectors to spaCy. Just like the #[+api("vocab") #[code Vocab]],
-    |  it is initialised with a #[+api("stringstore") #[code StringStore]] or
-    |  a list of strings.
+    |  spaCy's new #[+api("vectors") #[code Vectors]] class greatly improves the
+    |  way word vectors are stored, accessed and used. The data is stored in
+    |  two structures:
 
-+code("Adding vectors one-by-one").
-    from spacy.strings import StringStore
-    from spacy.vectors import Vectors
++list
+    +item
+        |  An array, which can be either on CPU or #[+a("#gpu") GPU].
 
-    vector_data = {'dog': numpy.random.uniform(-1, 1, (300,)),
-                   'cat': numpy.random.uniform(-1, 1, (300,)),
-                   'orange': numpy.random.uniform(-1, 1, (300,))}
-
-    vectors = Vectors(StringStore(), 300)
-    for word, vector in vector_data.items():
-        vectors.add(word, vector)
+    +item
+        |  A dictionary mapping string-hashes to rows in the table.
 
 p
-    |  You can also add the vector values directly on initialisation:
+    |  Keep in mind that the #[code Vectors] class itself has no
+    |  #[+api("stringstore") #[code StringStore]], so you have to store the
+    |  hash-to-string mapping separately. If you need to manage the strings,
+    |  you should use the #[code Vectors] via the
+    |  #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors]. To
+    |  add vectors to the vocabulary, you can use the
+    |  #[+api("vocab#set_vector") #[code Vocab.set_vector]] method.
 
-+code("Adding vectors on initialisation").
-    from spacy.vectors import Vectors
++code("Adding vectors").
+    from spacy.vocab import Vocab
 
-    vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
+    vector_data = {u'dog': numpy.random.uniform(-1, 1, (300,)),
+                   u'cat': numpy.random.uniform(-1, 1, (300,)),
+                   u'orange': numpy.random.uniform(-1, 1, (300,))}
+
+    vocab = Vocab()
+    for word, vector in vector_data.items():
+        vocab.set_vector(word, vector)
 
 +h(3, "custom-loading-glove") Loading GloVe vectors
     +tag-new(2)
@@ -89,3 +177,20 @@ p
     |  #[+api("vocab#set_vector") #[code set_vector]] method.
 
 +github("spacy", "examples/vectors_fast_text.py")
+
++h(3, "custom-similarity") Using custom similarity methods
+
+p
+    |  By default, #[+api("token#vector") #[code Token.vector]] returns the
+    |  vector for its underlying #[+api("lexeme") #[code Lexeme]], while
+    |  #[+api("doc#vector") #[code Doc.vector]] and
+    |  #[+api("span#vector") #[code Span.vector]] return an average of the
+    |  vectors of their tokens. You can customise these
+    |  behaviours by modifying the #[code doc.user_hooks],
+    |  #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
+    |  dictionaries.
+
++infobox
+    |  For more details on #[strong adding hooks] and #[strong overwriting] the
+    |  built-in #[code Doc], #[code Span] and #[code Token] methods, see the
+    |  usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
diff --git a/website/usage/_vectors-similarity/_in-context.jade b/website/usage/_vectors-similarity/_in-context.jade
deleted file mode 100644
index becd74348..000000000
--- a/website/usage/_vectors-similarity/_in-context.jade
+++ /dev/null
@@ -1,123 +0,0 @@
-//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > IN CONTEXT
-
-p
-    |  Aside from spaCy's built-in word vectors, which were trained on a lot of
-    |  text with a wide vocabulary, the parsing, tagging and NER models also
-    |  rely on vector representations of the #[strong meanings of words in context].
-    |  As the first component of the
-    |  #[+a("/usage/processing-pipelines") processing pipeline], the
-    |  tensorizer encodes a document's internal meaning representations as an
-    |  array of floats, also called a tensor. This allows spaCy to make a
-    |  reasonable guess at a word's meaning, based on its surrounding words.
-    |  Even if a word hasn't been seen before, spaCy will know #[em something]
-    |  about it. Because spaCy uses a 4-layer convolutional network, the
-    |  tensors are sensitive to up to #[strong four words on either side] of a
-    |  word.
-
-p
-    |  For example, here are three sentences containing the out-of-vocabulary
-    |  word "labrador" in different contexts.
-
-+code.
-    doc1 = nlp(u"The labrador barked.")
-    doc2 = nlp(u"The labrador swam.")
-    doc3 = nlp(u"the labrador people live in canada.")
-
-    for doc in [doc1, doc2, doc3]:
-        labrador = doc[1]
-        dog = nlp(u"dog")
-        print(labrador.similarity(dog))
-
-p
-    |  Even though the model has never seen the word "labrador", it can make a
-    |  fairly accurate prediction of its similarity to "dog" in different
-    |  contexts.
-
-+table(["Context", "labrador.similarity(dog)"])
-    +row
-        +cell The #[strong labrador] barked.
-        +cell #[code 0.56] #[+procon("yes", "similar")]
-
-    +row
-        +cell The #[strong labrador] swam.
-        +cell #[code 0.48] #[+procon("no", "dissimilar")]
-
-    +row
-        +cell the #[strong labrador] people live in canada.
-        +cell #[code 0.39] #[+procon("no", "dissimilar")]
-
-p
-    |  The same also works for whole documents. Here, the variance of the
-    |  similarities is lower, as all words and their order are taken into
-    |  account. However, the context-specific similarity is often still
-    |  reflected pretty accurately.
-
-+code.
-    doc1 = nlp(u"Paris is the largest city in France.")
-    doc2 = nlp(u"Vilnius is the capital of Lithuania.")
-    doc3 = nlp(u"An emu is a large bird.")
-
-    for doc in [doc1, doc2, doc3]:
-        for other_doc in [doc1, doc2, doc3]:
-            print(doc.similarity(other_doc))
-
-p
-    |  Even though the sentences about Paris and Vilnius consist of different
-    |  words and entities, they both describe the same concept and are seen as
-    |  more similar than the sentence about emus. In this case, even a misspelled
-    |  version of "Vilnius" would still produce very similar results.
-
-+table
-    - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
-    - var counter = 0
-
-    +row
-    +row
-        +cell
-        for _, label in examples
-            +cell=label
-
-    each cells, label in examples
-        +row(counter ? null : "divider")
-            +cell=label
-            for cell in cells
-                +cell.u-text-center
-                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
-                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
-        - counter++
-
-p
-    |  Sentences that consist of the same words in different order will likely
-    |  be seen as very similar – but never identical.
-
-+code.
-    docs = [nlp(u"dog bites man"), nlp(u"man bites dog"),
-            nlp(u"man dog bites"), nlp(u"dog man bites")]
-
-    for doc in docs:
-        for other_doc in docs:
-            print(doc.similarity(other_doc))
-
-p
-    |  Interestingly, "man bites dog" and "man dog bites" are seen as slightly
-    |  more similar than "man bites dog" and "dog bites man". This may be a
-    |  conincidence – or the result of "man" being interpreted as both sentence's
-    |  subject.
-
-+table
-    - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]}
-    - var counter = 0
-
-    +row("head")
-        +cell
-        for _, label in examples
-            +cell.u-text-center=label
-
-    each cells, label in examples
-        +row(counter ? null : "divider")
-            +cell=label
-            for cell in cells
-                +cell.u-text-center
-                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
-                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
-        - counter++
diff --git a/website/usage/vectors-similarity.jade b/website/usage/vectors-similarity.jade
index 1e1139b20..fd70910ae 100644
--- a/website/usage/vectors-similarity.jade
+++ b/website/usage/vectors-similarity.jade
@@ -5,10 +5,6 @@ include ../_includes/_mixins
 +section("basics")
     include _vectors-similarity/_basics
 
-+section("in-context")
-    +h(2, "in-context") Similarities in context
-    include _vectors-similarity/_in-context
-
 +section("custom")
     +h(2, "custom") Customising word vectors
     include _vectors-similarity/_custom

From 5fd851a80be4b2d8c720bb066437dc9e9e3e2f23 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 01:46:50 +0100
Subject: [PATCH 71/90] Log errors

---
 website/assets/js/models.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/website/assets/js/models.js b/website/assets/js/models.js
index e79073edd..e05fabea0 100644
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@@ -240,7 +240,8 @@ export class ModelComparer {
         return data;
     }
 
-    showError() {
+    showError(err) {
+        console.error(err);
         this.tpl.get('result').style.display = 'none';
         this.tpl.get('error').style.display = 'block';
     }

From 092333afd469611bbb8abe00f554837d03a3c3fe Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 01:47:31 +0100
Subject: [PATCH 72/90] Update vector details and number conversion

---
 website/assets/js/models.js |  4 ++--
 website/assets/js/util.js   | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/website/assets/js/models.js b/website/assets/js/models.js
index e05fabea0..8df49acc2 100644
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@@ -1,6 +1,6 @@
 'use strict';
 
-import { Templater, handleResponse, convertNumber } from './util.js';
+import { Templater, handleResponse, convertNumber, abbrNumber } from './util.js';
 
 /**
  * Chart.js defaults
@@ -25,7 +25,7 @@ export const formats = {
     license: (license, url) => url ? `<a href="${url}" target="_blank">${license}</a>` : license,
     sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
     pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
-    vectors: vec => vec ? `${convertNumber(vec.entries)} (${vec.width} dimensions)` : 'n/a',
+    vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
     version: version => `<code>v${version}</code>`
 };
 
diff --git a/website/assets/js/util.js b/website/assets/js/util.js
index 6bf14f578..65d05774c 100644
--- a/website/assets/js/util.js
+++ b/website/assets/js/util.js
@@ -46,11 +46,24 @@ export const handleResponse = res => {
     else return ({ ok: res.ok })
 };
 
-
 /**
  * Convert a number to a string and add thousand separator.
  * @param {number|string} num - The number to convert.
  * @param {string} separator – Thousand separator.
  */
-export const convertNumber = (num, separator = ',') =>
+export const convertNumber = (num = 0, separator = ',') =>
     num.toString().replace(/\B(?=(\d{3})+(?!\d))/g, separator);
+
+/**
+ * Abbreviate a number, e.g. 14249930 --> 14.25m.
+ * @param {number} num - The number to convert.
+ * @param {number} fixed - Number of decimals.
+ */
+export const abbrNumber = (num = 0, fixed = 2) => {
+    const suffixes = ['', 'k', 'm', 'b', 't'];
+    if (num === null || num === 0) return 0;
+    const b = num.toPrecision(2).split('e');
+    const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
+    const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
+    return (c < 0 ? c : Math.abs(c)) + suffixes[k];
+}

From 3b7ec64caa179dd24d6124a4643124b514d7c9f1 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 01:52:45 +0100
Subject: [PATCH 73/90] Add PYTHONPATH to build from source quickstart

---
 website/usage/_install/_quickstart.jade | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/usage/_install/_quickstart.jade b/website/usage/_install/_quickstart.jade
index 8e581994c..b4ee10c4b 100644
--- a/website/usage/_install/_quickstart.jade
+++ b/website/usage/_install/_quickstart.jade
@@ -19,6 +19,7 @@
 
     +qs({package: 'source'}) git clone https://github.com/explosion/spaCy
     +qs({package: 'source'}) cd spaCy
+    +qs({package: 'source'}) export PYTHONPATH=`pwd`
     +qs({package: 'source'}) pip install -r requirements.txt
     +qs({package: 'source'}) pip install -e .
 

From f84660986a35502c908e6b76933ca508cdabc406 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 01:57:33 +0100
Subject: [PATCH 74/90] Update example sentences for models quickstart

---
 website/models/_data.json | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/website/models/_data.json b/website/models/_data.json
index 1120b0fcc..cb971e20c 100644
--- a/website/models/_data.json
+++ b/website/models/_data.json
@@ -115,6 +115,8 @@
         "de": "Dies ist ein Satz.",
         "fr": "C'est une phrase.",
         "es": "Esto es una frase.",
+        "pt": "Esta é uma frase.",
+        "it": "Questa è una frase.",
         "xx": "This is a sentence about Facebook."
     }
 }

From a6f6bd6c98c58a748cb5f64d1eee7cb49372c917 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 02:04:00 +0100
Subject: [PATCH 75/90] Adjust tag spacing

---
 website/assets/css/_base/_utilities.sass | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass
index 8c1e82706..9b1c0cedc 100644
--- a/website/assets/css/_base/_utilities.sass
+++ b/website/assets/css/_base/_utilities.sass
@@ -47,7 +47,7 @@
     font: 600 1.1rem/#{1} $font-secondary
     background: $color-theme
     color: $color-back
-    padding: 0.15em 0.5em 0.35em
+    padding: 2px 6px 4px
     border-radius: 1em
     text-transform: uppercase
     vertical-align: middle

From c48dd0e1d33416915d86d2728bd2d9545b1940b1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 02:06:58 +0100
Subject: [PATCH 76/90] Fix vector pruning

---
 spacy/vectors.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 95378947a..a77fb2236 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -275,7 +275,10 @@ cdef class Vectors:
             sims = xp.dot(batch, vectors.T)
             best_rows[i:i+batch_size] = sims.argmax(axis=1)
             scores[i:i+batch_size] = sims.max(axis=1)
-        keys = self.find(rows=best_rows)
+
+        xp = get_array_module(self.data)
+        row2key = {row: key for key, row in self.key2row.items()}
+        keys = xp.asarray([row2key[row] for row in best_rows], dtype='uint64')
         return (keys, best_rows, scores)
 
     def from_glove(self, path):

From affd3404ab24b5143ba97b26c40a90dc4b1dcbc0 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 13:14:03 +0100
Subject: [PATCH 77/90] Remove old model command (now "vocab")

---
 spacy/__main__.py     |   3 +-
 spacy/cli/__init__.py |   1 -
 spacy/cli/model.py    | 140 ------------------------------------------
 3 files changed, 1 insertion(+), 143 deletions(-)
 delete mode 100644 spacy/cli/model.py

diff --git a/spacy/__main__.py b/spacy/__main__.py
index f4b5e6715..6b9b909fe 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -6,7 +6,7 @@ from __future__ import print_function
 if __name__ == '__main__':
     import plac
     import sys
-    from spacy.cli import download, link, info, package, train, convert, model
+    from spacy.cli import download, link, info, package, train, convert
     from spacy.cli import vocab, profile, evaluate, validate
     from spacy.util import prints
 
@@ -18,7 +18,6 @@ if __name__ == '__main__':
         'evaluate': evaluate,
         'convert': convert,
         'package': package,
-        'model': model,
         'vocab': vocab,
         'profile': profile,
         'validate': validate
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index b807480ca..2c7bc449b 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -6,6 +6,5 @@ from .profile import profile
 from .train import train
 from .evaluate import evaluate
 from .convert import convert
-from .model import model
 from .vocab import make_vocab as vocab
 from .validate import validate
diff --git a/spacy/cli/model.py b/spacy/cli/model.py
deleted file mode 100644
index bcc1626bc..000000000
--- a/spacy/cli/model.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-try:
-    import bz2
-    import gzip
-except ImportError:
-    pass
-import math
-from ast import literal_eval
-from pathlib import Path
-
-import numpy as np
-import spacy
-from preshed.counter import PreshCounter
-
-from .. import util
-from ..compat import fix_text
-
-
-def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
-          min_doc_freq=5, min_word_freq=200):
-    model_path = Path(model_dir)
-    freqs_path = Path(freqs_data)
-    clusters_path = Path(clusters_data) if clusters_data else None
-    vectors_path = Path(vectors_data) if vectors_data else None
-
-    check_dirs(freqs_path, clusters_path, vectors_path)
-    vocab = util.get_lang_class(lang).Defaults.create_vocab()
-    nlp = spacy.blank(lang)
-    vocab = nlp.vocab
-    probs, oov_prob = read_probs(
-        freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_doc_freq))
-    clusters = read_clusters(clusters_path) if clusters_path else {}
-    populate_vocab(vocab, clusters, probs, oov_prob)
-    add_vectors(vocab, vectors_path)
-    create_model(model_path, nlp)
-
-
-def add_vectors(vocab, vectors_path):
-    with bz2.BZ2File(vectors_path.as_posix()) as f:
-        num_words, dim = next(f).split()
-        vocab.clear_vectors(int(dim))
-        for line in f:
-            word_w_vector = line.decode("utf8").strip().split(" ")
-            word = word_w_vector[0]
-            vector = np.array([float(val) for val in word_w_vector[1:]])
-            if word in vocab:
-                vocab.set_vector(word, vector)
-
-
-def create_model(model_path, model):
-    if not model_path.exists():
-        model_path.mkdir()
-    model.to_disk(model_path.as_posix())
-
-
-def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
-    counts = PreshCounter()
-    total = 0
-    freqs_file = check_unzip(freqs_path)
-    for i, line in enumerate(freqs_file):
-        freq, doc_freq, key = line.rstrip().split('\t', 2)
-        freq = int(freq)
-        counts.inc(i + 1, freq)
-        total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    freqs_file = check_unzip(freqs_path)
-    probs = {}
-    for line in freqs_file:
-        freq, doc_freq, key = line.rstrip().split('\t', 2)
-        doc_freq = int(doc_freq)
-        freq = int(freq)
-        if doc_freq >= min_doc_freq and freq >= min_freq and len(
-                key) < max_length:
-            word = literal_eval(key)
-            smooth_count = counts.smoother(int(freq))
-            probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-
-
-def read_clusters(clusters_path):
-    clusters = {}
-    with clusters_path.open() as f:
-        for line in f:
-            try:
-                cluster, word, freq = line.split()
-                word = fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = '0'
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
-
-
-def populate_vocab(vocab, clusters, probs, oov_prob):
-    for word, prob in reversed(
-            sorted(list(probs.items()), key=lambda item: item[1])):
-        lexeme = vocab[word]
-        lexeme.prob = prob
-        lexeme.is_oov = False
-        # Decode as a little-endian string, so that we can do & 15 to get
-        # the first 4 bits. See _parse_features.pyx
-        if word in clusters:
-            lexeme.cluster = int(clusters[word][::-1], 2)
-        else:
-            lexeme.cluster = 0
-
-
-def check_unzip(file_path):
-    file_path_str = file_path.as_posix()
-    if file_path_str.endswith('gz'):
-        return gzip.open(file_path_str)
-    else:
-        return file_path.open()
-
-
-def check_dirs(freqs_data, clusters_data, vectors_data):
-    if not freqs_data.is_file():
-        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
-    if clusters_data and not clusters_data.is_file():
-        util.sys_exit(
-            clusters_data.as_posix(), title="No Brown clusters file found")
-    if vectors_data and not vectors_data.is_file():
-        util.sys_exit(
-            vectors_data.as_posix(), title="No word vectors file found")

From bfe17b7df1192238b8c122cfd3b2c74bca9d0249 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 13:14:31 +0100
Subject: [PATCH 78/90] Fix begin_training if get_gold_tuples is None

---
 examples/training/train_intent_parser.py   | 2 +-
 examples/training/train_new_entity_type.py | 2 +-
 examples/training/train_parser.py          | 2 +-
 examples/training/train_tagger.py          | 2 +-
 examples/training/train_textcat.py         | 2 +-
 spacy/language.py                          | 4 +++-
 6 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py
index def0ed370..b51a4a10c 100644
--- a/examples/training/train_intent_parser.py
+++ b/examples/training/train_intent_parser.py
@@ -94,7 +94,7 @@ def main(model=None, output_dir=None, n_iter=100):
 
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
     with nlp.disable_pipes(*other_pipes):  # only train parser
-        optimizer = nlp.begin_training(lambda: [])
+        optimizer = nlp.begin_training()
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index 1c70f7c03..062191440 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -87,7 +87,7 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
         random.seed(0)
-        optimizer = nlp.begin_training(lambda: [])
+        optimizer = nlp.begin_training()
         for itn in range(n_iter):
             losses = {}
             gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA)
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index e321fdb1e..9e1d10414 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -64,7 +64,7 @@ def main(model=None, output_dir=None, n_iter=1000):
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
     with nlp.disable_pipes(*other_pipes):  # only train parser
-        optimizer = nlp.begin_training(lambda: [])
+        optimizer = nlp.begin_training()
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index 7508c2e66..95b9efcbf 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -61,7 +61,7 @@ def main(lang='en', output_dir=None, n_iter=25):
     tagger = nlp.create_pipe('tagger')
     nlp.add_pipe(tagger)
 
-    optimizer = nlp.begin_training(lambda: [])
+    optimizer = nlp.begin_training()
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index fc9610a66..852635075 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -59,7 +59,7 @@ def main(model=None, output_dir=None, n_iter=20):
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
     with nlp.disable_pipes(*other_pipes):  # only train textcat
-        optimizer = nlp.begin_training(lambda: [])
+        optimizer = nlp.begin_training()
         print("Training the model...")
         print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
         for i in range(n_iter):
diff --git a/spacy/language.py b/spacy/language.py
index 806172f36..01ffd07bf 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -436,8 +436,10 @@ class Language(object):
         **cfg: Config parameters.
         RETURNS: An optimizer
         """
+        if get_gold_tuples is None:
+            get_gold_tuples = lambda: []
         # Populate vocab
-        if get_gold_tuples is not None:
+        else:
             for _, annots_brackets in get_gold_tuples():
                 for annots, _ in annots_brackets:
                     for word in annots[1]:

From 0fbab8160d280e2df4d30836b4f91edd7febef32 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 13:14:43 +0100
Subject: [PATCH 79/90] Update GloVe vectors example

---
 website/usage/_vectors-similarity/_custom.jade | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/website/usage/_vectors-similarity/_custom.jade b/website/usage/_vectors-similarity/_custom.jade
index 7792949d1..a6a28058f 100644
--- a/website/usage/_vectors-similarity/_custom.jade
+++ b/website/usage/_vectors-similarity/_custom.jade
@@ -161,10 +161,16 @@ p
         +cell float64 (double)
 
 +code.
-    from spacy.vectors import Vectors
+    nlp = spacy.load('en')
+    nlp.vocab.vectors.from_glove('/path/to/vectors')
 
-    vectors = Vectors([], 128)
-    vectors.from_glove('/path/to/vectors')
+p
+    |  If your instance of #[code Language] already contains vectors, they will
+    |  be overwritten. To create your own GloVe vectors model package like
+    |  spaCy's #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]],
+    |  you can call #[+api("language#to_disk") #[code nlp.to_disk]], and then
+    |  package the model using the #[+api("cli#package") #[code package]]
+    |  command.
 
 +h(3, "custom-loading-other") Loading other vectors
     +tag-new(2)

From c047498f87e9892f198316bc8e42e35608eda8df Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 13:24:47 +0100
Subject: [PATCH 80/90] Fix vectors test

---
 spacy/tests/vectors/test_vectors.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py
index ce183f9fd..a9eabc78d 100644
--- a/spacy/tests/vectors/test_vectors.py
+++ b/spacy/tests/vectors/test_vectors.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from ...vectors import Vectors
 from ...tokenizer import Tokenizer
+from ...strings import hash_string
 from ..util import add_vecs_to_vocab, get_doc
 
 import numpy
@@ -45,6 +46,7 @@ def test_init_vectors_with_shape(strings):
 
 def test_get_vector(strings, data):
     v = Vectors(data=data)
+    strings = [hash_string(s) for s in strings]
     for i, string in enumerate(strings):
         v.add(string, row=i)
     assert list(v[strings[0]]) == list(data[0])
@@ -55,6 +57,7 @@ def test_get_vector(strings, data):
 def test_set_vector(strings, data):
     orig = data.copy()
     v = Vectors(data=data)
+    strings = [hash_string(s) for s in strings]
     for i, string in enumerate(strings):
         v.add(string, row=i)
     assert list(v[strings[0]]) == list(orig[0])

From 301fb2bb60e1d2f0cb2097304298757987d2ffbb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 13:25:12 +0100
Subject: [PATCH 81/90] Implement Span.n_lefts and Span.n_rights

---
 spacy/tokens/span.pyx | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index efe511089..49b892adb 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -474,17 +474,15 @@ cdef class Span:
         """RETURNS (int): The number of leftward immediate children of the
             span, in the syntactic dependency parse.
         """
-        # TODO: implement
         def __get__(self):
-            raise NotImplementedError
+            return len(list(self.lefts))
 
     property n_rights:
         """RETURNS (int): The number of rightward immediate children of the
             span, in the syntactic dependency parse.
         """
-        # TODO: implement
         def __get__(self):
-            raise NotImplementedError
+            return len(list(self.rights))
 
     property subtree:
         """Tokens that descend from tokens in the span, but fall outside it.

From 7e7116cdf74d4637141b2083d733e773c3f81c53 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 13:25:44 +0100
Subject: [PATCH 82/90] Fix Doc.to_array when only one string attr provided

---
 spacy/tokens/doc.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7a2e95e4b..fb15323f5 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -28,7 +28,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
 from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
-from ..compat import is_config, copy_reg, pickle
+from ..compat import is_config, copy_reg, pickle, basestring_
 from .. import about
 from .. import util
 from .underscore import Underscore
@@ -571,7 +571,8 @@ cdef class Doc:
         cdef np.ndarray[attr_t, ndim=1] attr_ids
         cdef np.ndarray[attr_t, ndim=2] output
         # Handle scalar/list inputs of strings/ints for py_attr_ids
-        if not hasattr(py_attr_ids, '__iter__'):
+        if not hasattr(py_attr_ids, '__iter__') \
+        or isinstance(py_attr_ids, basestring_):
             py_attr_ids = [py_attr_ids]
 
         # Allow strings, e.g. 'lemma' or 'LEMMA'

From 9e0ebee81ca1573ddca6e5a3de904279e0ee5c1f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 13:27:14 +0100
Subject: [PATCH 83/90] Add Token.is_sent_start property, so can deprecate
 Token.sent_start

---
 spacy/tests/doc/test_token_api.py | 12 ++++++------
 spacy/tokens/token.pyx            | 24 ++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index c02904905..77a15fd43 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -155,13 +155,13 @@ def test_doc_token_api_head_setter(en_tokenizer):
     assert doc[2].left_edge.i == 0
 
 
-def test_sent_start(en_tokenizer):
+def test_is_sent_start(en_tokenizer):
     doc = en_tokenizer(u'This is a sentence. This is another.')
-    assert not doc[0].sent_start
-    assert not doc[5].sent_start
-    doc[5].sent_start = True
-    assert doc[5].sent_start
-    assert not doc[0].sent_start
+    assert doc[5].is_sent_start is None
+    doc[5].is_sent_start = True
+    assert doc[5].is_sent_start is True
+    # Backwards compatibility
+    assert doc[0].sent_start is False
     doc.is_parsed = True
     assert len(list(doc.sents)) == 2
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index c3d92b389..af88872fb 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -330,9 +330,29 @@ cdef class Token:
             return self.c.r_kids
 
     property sent_start:
-        # TODO: fix and document
+        # TODO deprecation warning
         def __get__(self):
-            return self.c.sent_start
+            # Backwards compatibility: doc[0].sent_start was always False.
+            # Read the C struct field; self.sent_start would recurse forever.
+            if self.i == 0:
+                return False
+            else:
+                return self.c.sent_start
+
+        def __set__(self, value):
+            self.is_sent_start = value
+
+    property is_sent_start:
+        """RETURNS (bool / None): Whether the token starts a sentence.
+            None if unknown.
+        """
+        def __get__(self):
+            if self.c.sent_start == 0:
+                return None
+            elif self.c.sent_start < 0:
+                return False
+            else:
+                return True
 
         def __set__(self, value):
             if self.doc.is_parsed:

From a7bf38bf31f7d7c143aa2c324d411c22ac3bdf2a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 13:57:25 +0100
Subject: [PATCH 84/90] Remove misleading comment on util.get_cuda_stream()

---
 spacy/util.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index a45d43c47..3fbd22aaf 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -236,8 +236,6 @@ def is_in_jupyter():
 
 
 def get_cuda_stream(require=False):
-    # TODO: Error and tell to install chainer if not found
-    # Requires GPU
     return CudaStream() if CudaStream is not None else None
 
 

From 9e429b5a8a42ab3e3b48e08c34b5bed16ed9708b Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 14:13:08 +0100
Subject: [PATCH 85/90] Update formatting of deprecation note

---
 website/api/_top-level/_spacy.jade | 18 +++++++-------
 website/api/language.jade          | 40 +++++++++++++++---------------
 website/api/matcher.jade           | 20 +++++++--------
 3 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade
index c6b342011..4b21bd78f 100644
--- a/website/api/_top-level/_spacy.jade
+++ b/website/api/_top-level/_spacy.jade
@@ -58,16 +58,16 @@ p
     nlp.from_disk(model_data_path)          #  load in model data
 
 +infobox("Deprecation note", "⚠️")
-    .o-block
-        |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
-        |  will also raise an error if no model could be loaded and never just
-        |  return an empty #[code Language] object. If you need a blank language,
-        |  you can use the new function #[+api("spacy#blank") #[code spacy.blank()]]
-        |  or import the class explicitly, e.g.
-        |  #[code from spacy.lang.en import English].
+    |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
+    |  will also raise an error if no model could be loaded and never just
+    |  return an empty #[code Language] object. If you need a blank language,
+    |  you can use the new function #[+api("spacy#blank") #[code spacy.blank()]]
+    |  or import the class explicitly, e.g.
+    |  #[code from spacy.lang.en import English].
 
-    +code-new nlp = spacy.load('/model')
-    +code-old nlp = spacy.load('en', path='/model')
+    +code-wrapper
+        +code-new nlp = spacy.load('/model')
+        +code-old nlp = spacy.load('en', path='/model')
 
 +h(3, "spacy.blank") spacy.blank
     +tag function
diff --git a/website/api/language.jade b/website/api/language.jade
index f86257f38..1bc9b601f 100644
--- a/website/api/language.jade
+++ b/website/api/language.jade
@@ -84,13 +84,13 @@ p
         +cell A container for accessing the annotations.
 
 +infobox("Deprecation note", "⚠️")
-    .o-block
-        |  Pipeline components to prevent from being loaded can now be added as
-        |  a list to #[code disable], instead of specifying one keyword argument
-        |  per component.
+    |  Pipeline components to prevent from being loaded can now be added as
+    |  a list to #[code disable], instead of specifying one keyword argument
+    |  per component.
 
-    +code-new doc = nlp(u"I don't want parsed", disable=['parser'])
-    +code-old doc = nlp(u"I don't want parsed", parse=False)
+    +code-wrapper
+        +code-new doc = nlp(u"I don't want parsed", disable=['parser'])
+        +code-old doc = nlp(u"I don't want parsed", parse=False)
 
 +h(2, "pipe") Language.pipe
     +tag method
@@ -533,15 +533,15 @@ p
         +cell The modified #[code Language] object.
 
 +infobox("Deprecation note", "⚠️")
-    .o-block
-        |  As of spaCy v2.0, the #[code save_to_directory] method has been
-        |  renamed to #[code to_disk], to improve consistency across classes.
-        |  Pipeline components to prevent from being loaded can now be added as
-        |  a list to #[code disable], instead of specifying one keyword argument
-        |  per component.
+    |  As of spaCy v2.0, the #[code save_to_directory] method has been
+    |  renamed to #[code to_disk], to improve consistency across classes.
+    |  Pipeline components to prevent from being loaded can now be added as
+    |  a list to #[code disable], instead of specifying one keyword argument
+    |  per component.
 
-    +code-new nlp = English().from_disk(disable=['tagger', 'ner'])
-    +code-old nlp = spacy.load('en', tagger=False, entity=False)
+    +code-wrapper
+        +code-new nlp = English().from_disk(disable=['tagger', 'ner'])
+        +code-old nlp = spacy.load('en', tagger=False, entity=False)
 
 +h(2, "to_bytes") Language.to_bytes
     +tag method
@@ -595,13 +595,13 @@ p Load state from a binary string.
         +cell The #[code Language] object.
 
 +infobox("Deprecation note", "⚠️")
-    .o-block
-        |  Pipeline components to prevent from being loaded can now be added as
-        |  a list to #[code disable], instead of specifying one keyword argument
-        |  per component.
+    |  Pipeline components to prevent from being loaded can now be added as
+    |  a list to #[code disable], instead of specifying one keyword argument
+    |  per component.
 
-    +code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
-    +code-old nlp = English().from_bytes('en', tagger=False, entity=False)
+    +code-wrapper
+        +code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
+        +code-old nlp = English().from_bytes('en', tagger=False, entity=False)
 
 +h(2, "attributes") Attributes
 
diff --git a/website/api/matcher.jade b/website/api/matcher.jade
index 35aba4cba..097ac7008 100644
--- a/website/api/matcher.jade
+++ b/website/api/matcher.jade
@@ -203,18 +203,18 @@ p
             |  dict describes a token.
 
 +infobox("Deprecation note", "⚠️")
-    .o-block
-        |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
-        |  are deprecated and have been replaced with a simpler
-        |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
-        |  patterns and a callback for a given match ID.
+    |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
+    |  are deprecated and have been replaced with a simpler
+    |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
+    |  patterns and a callback for a given match ID.
 
-    +code-new.
-        matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+    +code-wrapper
+        +code-new.
+            matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
 
-    +code-old.
-        matcher.add_entity('GoogleNow', on_match=merge_phrases)
-        matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+        +code-old.
+            matcher.add_entity('GoogleNow', on_match=merge_phrases)
+            matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
 
 +h(2, "remove") Matcher.remove
     +tag method

From 1c7313051f6968792c083cbb593f81d33fa3d563 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 14:13:22 +0100
Subject: [PATCH 86/90] Document Token.is_sent_start

---
 website/api/token.jade | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/website/api/token.jade b/website/api/token.jade
index f8fa15fe8..c75a8f2d0 100644
--- a/website/api/token.jade
+++ b/website/api/token.jade
@@ -393,6 +393,37 @@ p A sequence of all the token's syntactic descendents.
         +cell #[code Token]
         +cell A descendant token such that #[code self.is_ancestor(descendant)].
 
++h(2, "is_sent_start") Token.is_sent_start
+    +tag property
+    +tag-new(2)
+
+p
+    |  A boolean value indicating whether the token starts a sentence.
+    |  #[code None] if unknown.
+
++aside-code("Example").
+    doc = nlp(u'Give it back! He pleaded.')
+    assert doc[4].is_sent_start
+    assert not doc[5].is_sent_start
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell bool
+        +cell Whether the token starts a sentence.
+
++infobox("Deprecation note", "⚠️")
+    |  As of spaCy v2.0, the #[code Token.sent_start] property is deprecated and
+    |  has been replaced with #[code Token.is_sent_start], which returns a
+    |  boolean value instead of a misleading #[code 0] for #[code False] and
+    |  #[code 1] for #[code True]. It also now returns #[code None] if the
+    |  answer is unknown, and fixes a quirk in the old logic that would always
+    |  set the property to #[code 0] for the first word of the document.
+
+    +code-wrapper
+        +code-new assert doc[4].is_sent_start == True
+        +code-old assert doc[4].sent_start == 1
+
 +h(2, "has_vector") Token.has_vector
     +tag property
     +tag-model("vectors")

From 5ab4e96144e7dcb2dfcadd6ce1fd6198547f021a Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 1 Nov 2017 14:13:36 +0100
Subject: [PATCH 87/90] Update v2 guide and split into partials

---
 website/usage/_data.json          |   1 +
 website/usage/_v2/_features.jade  | 237 ++++++++++++++
 website/usage/_v2/_incompat.jade  | 141 ++++++++
 website/usage/_v2/_migrating.jade | 224 +++++++++++++
 website/usage/_v2/_summary.jade   |  74 +++++
 website/usage/v2.jade             | 519 +-----------------------------
 6 files changed, 682 insertions(+), 514 deletions(-)
 create mode 100644 website/usage/_v2/_features.jade
 create mode 100644 website/usage/_v2/_incompat.jade
 create mode 100644 website/usage/_v2/_migrating.jade
 create mode 100644 website/usage/_v2/_summary.jade

diff --git a/website/usage/_data.json b/website/usage/_data.json
index 498202695..fa7a1fcd2 100644
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@@ -79,6 +79,7 @@
         "title": "What's New in v2.0",
         "teaser": "New features, backwards incompatibilities and migration guide.",
         "menu": {
+            "Summary": "summary",
             "New features": "features",
             "Backwards Incompatibilities": "incompat",
             "Migrating from v1.x": "migrating",
diff --git a/website/usage/_v2/_features.jade b/website/usage/_v2/_features.jade
new file mode 100644
index 000000000..316f1d1dc
--- /dev/null
+++ b/website/usage/_v2/_features.jade
@@ -0,0 +1,237 @@
+//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0 > NEW FEATURES
+
+p
+    |  This section contains an overview of the most important
+    |  #[strong new features and improvements]. The #[+a("/api") API docs]
+    |  include additional deprecation notes. New methods and functions that
+    |  were introduced in this version are marked with a
+    |  #[span.u-text-tag.u-text-tag--spaced v2.0] tag.
+
++h(3, "features-models") Convolutional neural network models
+
++aside-code("Example", "bash")
+    for model in ["en", "de", "fr", "es", "pt", "it"]
+        | spacy download #{model}  # default #{LANGUAGES[model]} model!{'\n'}
+    | spacy download xx_ent_wiki_sm  # multi-language NER
+
+p
+    |  spaCy v2.0 features new neural models for tagging,
+    |  parsing and entity recognition. The models have
+    |  been designed and implemented from scratch specifically for spaCy, to
+    |  give you an unmatched balance of speed, size and accuracy. The new
+    |  models are #[strong 10&times; smaller], #[strong 20% more accurate],
+    |  and #[strong just as fast] as the previous generation.
+    |  #[strong GPU usage] is now supported via
+    |  #[+a("http://chainer.org") Chainer]'s CuPy module.
+
++infobox
+    |  #[+label-inline Usage:] #[+a("/models") Models directory],
+    |  #[+a("/models/comparison") Models comparison],
+    |  #[+a("/usage/#gpu") Using spaCy with GPU]
+
++h(3, "features-pipelines") Improved processing pipelines
+
++aside-code("Example").
+    # Set custom attributes
+    Doc.set_extension('my_attr', default=False)
+    Token.set_extension('my_attr', getter=my_token_getter)
+    assert doc._.my_attr, token._.my_attr
+
+    # Add components to the pipeline
+    my_component = lambda doc: doc
+    nlp.add_pipe(my_component)
+
+p
+    |  It's now much easier to #[strong customise the pipeline] with your own
+    |  components: functions that receive a #[code Doc] object, modify and
+    |  return it. Extensions let you write any
+    |  #[strong attributes, properties and methods] to the #[code Doc],
+    |  #[code Token] and #[code Span]. You can add data, implement new
+    |  features, integrate other libraries with spaCy or plug in your own
+    |  machine learning models.
+
++image
+    include ../../assets/img/pipeline.svg
+
++infobox
+    |  #[+label-inline API:] #[+api("language") #[code Language]],
+    |  #[+api("doc#set_extension") #[code Doc.set_extension]],
+    |  #[+api("span#set_extension") #[code Span.set_extension]],
+    |  #[+api("token#set_extension") #[code Token.set_extension]]
+    |  #[+label-inline Usage:]
+    |  #[+a("/usage/processing-pipelines") Processing pipelines]
+    |  #[+label-inline Code:]
+    |  #[+src("/usage/examples#section-pipeline") Pipeline examples]
+
++h(3, "features-text-classification") Text classification
+
++aside-code("Example").
+    textcat = nlp.create_pipe('textcat')
+    nlp.add_pipe(textcat, last=True)
+    optimizer = nlp.begin_training()
+    for itn in range(100):
+        for doc, gold in train_data:
+            nlp.update([doc], [gold], sgd=optimizer)
+    doc = nlp(u'This is a text.')
+    print(doc.cats)
+
+p
+    |  spaCy v2.0 lets you add text categorization models to spaCy pipelines.
+    |  The model supports classification with multiple, non-mutually
+    |  exclusive labels – so multiple labels can apply at once. You can
+    |  change the model architecture rather easily, but by default, the
+    |  #[code TextCategorizer] class uses a convolutional neural network to
+    |  assign position-sensitive vectors to each word in the document.
+
++infobox
+    |  #[+label-inline API:] #[+api("textcategorizer") #[code TextCategorizer]],
+    |  #[+api("doc#attributes") #[code Doc.cats]],
+    |  #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br]
+    |  #[+label-inline Usage:] #[+a("/usage/text-classification") Text classification]
+
++h(3, "features-hash-ids") Hash values instead of integer IDs
+
++aside-code("Example").
+    doc = nlp(u'I love coffee')
+    assert doc.vocab.strings[u'coffee'] == 3197928453018144401
+    assert doc.vocab.strings[3197928453018144401] == u'coffee'
+
+    beer_hash = doc.vocab.strings.add(u'beer')
+    assert doc.vocab.strings[u'beer'] == beer_hash
+    assert doc.vocab.strings[beer_hash] == u'beer'
+
+p
+    |  The #[+api("stringstore") #[code StringStore]] now resolves all strings
+    |  to hash values instead of integer IDs. This means that the string-to-int
+    |  mapping #[strong no longer depends on the vocabulary state], making a lot
+    |  of workflows much simpler, especially during training. Unlike integer IDs
+    |  in spaCy v1.x, hash values will #[strong always match] – even across
+    |  models. Strings can now be added explicitly using the new
+    |  #[+api("stringstore#add") #[code StringStore.add]] method. A token's hash
+    |  is available via #[code token.orth].
+
++infobox
+    |  #[+label-inline API:] #[+api("stringstore") #[code StringStore]]
+    |  #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+
++h(3, "features-vectors") Improved word vectors support
+
++aside-code("Example").
+    for word, vector in vector_data:
+        nlp.vocab.set_vector(word, vector)
+    nlp.vocab.vectors.from_glove('/path/to/vectors')
+    # keep 10000 unique vectors and remap the rest
+    nlp.vocab.prune_vectors(10000)
+    nlp.to_disk('/model')
+
+p
+    |  The new #[+api("vectors") #[code Vectors]] class helps the
+    |  #[code Vocab] manage the vectors assigned to strings, and lets you
+    |  assign vectors individually, or
+    |  #[+a("/usage/vectors-similarity#custom-loading-glove") load in GloVe vectors]
+    |  from a directory. To help you strike a good balance between coverage
+    |  and memory usage, the #[code Vectors] class lets you map
+    |  #[strong multiple keys] to the #[strong same row] of the table. If
+    |  you're using the #[+api("cli#vocab") #[code spacy vocab]] command to
+    |  create a vocabulary, pruning the vectors will be taken care of
+    |  automatically. Otherwise, you can use the new
+    |  #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]].
+
++infobox
+    |  #[+label-inline API:] #[+api("vectors") #[code Vectors]],
+    |  #[+api("vocab") #[code Vocab]]
+    |  #[+label-inline Usage:] #[+a("/usage/vectors-similarity") Word vectors and semantic similarity]
+
++h(3, "features-serializer") Saving, loading and serialization
+
++aside-code("Example").
+    nlp = spacy.load('en') # shortcut link
+    nlp = spacy.load('en_core_web_sm') # package
+    nlp = spacy.load('/path/to/en') # unicode path
+    nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+
+    nlp.to_disk('/path/to/nlp')
+    nlp = English().from_disk('/path/to/nlp')
+
+p
+    |  spaCy's serialization API has been made consistent across classes and
+    |  objects. All container classes, i.e. #[code Language], #[code Doc],
+    |  #[code Vocab] and #[code StringStore] now have a #[code to_bytes()],
+    |  #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method
+    |  that supports the Pickle protocol.
+
+p
+    |  The improved #[code spacy.load] makes loading models easier and more
+    |  transparent. You can load a model by supplying its
+    |  #[+a("/usage/models#usage") shortcut link], the name of an installed
+    |  #[+a("/usage/saving-loading#generating") model package] or a path.
+    |  The #[code Language] class to initialise will be determined based on the
+    |  model's settings. For a blank language, you can import the class directly,
+    |  e.g. #[code from spacy.lang.en import English].
+
++infobox
+    |  #[+label-inline API:] #[+api("spacy#load") #[code spacy.load]]
+    |  #[+label-inline Usage:] #[+a("/usage/saving-loading") Saving and loading]
+
++h(3, "features-displacy") displaCy visualizer with Jupyter support
+
++aside-code("Example").
+    from spacy import displacy
+    doc = nlp(u'This is a sentence about Facebook.')
+    displacy.serve(doc, style='dep') # run the web server
+    html = displacy.render(doc, style='ent') # generate HTML
+
+p
+    |  Our popular dependency and named entity visualizers are now an official
+    |  part of the spaCy library. displaCy can run a simple web server, or
+    |  generate raw HTML markup or SVG files to be exported. You can pass in one
+    |  or more docs, and customise the style. displaCy also auto-detects whether
+    |  you're running #[+a("https://jupyter.org") Jupyter] and will render the
+    |  visualizations in your notebook.
+
++infobox
+    |  #[+label-inline API:] #[+api("displacy") #[code displacy]]
+    |  #[+label-inline Usage:] #[+a("/usage/visualizers") Visualizing spaCy]
+
++h(3, "features-language") Improved language data and lazy loading
+
+p
+    |  Language-specific data now lives in its own submodule, #[code spacy.lang].
+    |  Languages are lazy-loaded, i.e. only loaded when you import a
+    |  #[code Language] class, or load a model that initialises one. This allows
+    |  languages to contain more custom data, e.g. lemmatizer lookup tables, or
+    |  complex regular expressions. The language data has also been tidied up
+    |  and simplified. spaCy now also supports simple lookup-based
+    |  lemmatization – and #[strong #{LANG_COUNT} languages] in total!
+
++infobox
+    |  #[+label-inline API:] #[+api("language") #[code Language]]
+    |  #[+label-inline Code:] #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]]
+    |  #[+label-inline Usage:] #[+a("/usage/adding-languages") Adding languages]
+
++h(3, "features-matcher") Revised matcher API and phrase matcher
+
++aside-code("Example").
+    from spacy.matcher import Matcher, PhraseMatcher
+
+    matcher = Matcher(nlp.vocab)
+    matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
+
+    phrasematcher = PhraseMatcher(nlp.vocab)
+    phrasematcher.add('OBAMA', None, nlp(u"Barack Obama"))
+
+p
+    |  Patterns can now be added to the matcher by calling
+    |  #[+api("matcher#add") #[code matcher.add()]] with a match ID, an optional
+    |  callback function to be invoked on each match, and one or more patterns.
+    |  This allows you to write powerful, pattern-specific logic using only one
+    |  matcher. For example, you might only want to merge some entity types,
+    |  and set custom flags for other matched patterns. The new
+    |  #[+api("phrasematcher") #[code PhraseMatcher]] lets you efficiently
+    |  match very large terminology lists using #[code Doc] objects as match
+    |  patterns.
+
++infobox
+    |  #[+label-inline API:] #[+api("matcher") #[code Matcher]],
+    |  #[+api("phrasematcher") #[code PhraseMatcher]]
+    |  #[+label-inline Usage:] #[+a("/usage/rule-based-matching") Rule-based matching]
diff --git a/website/usage/_v2/_incompat.jade b/website/usage/_v2/_incompat.jade
new file mode 100644
index 000000000..e7546e73c
--- /dev/null
+++ b/website/usage/_v2/_incompat.jade
@@ -0,0 +1,141 @@
+//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0 > BACKWARDS INCOMPATIBILITIES
+
++table(["Old", "New"])
+    +row
+        +cell
+            |  #[code spacy.en] etc.
+        +cell
+            |  #[code spacy.lang.en] etc.
+
+    +row
+        +cell #[code spacy.orth]
+        +cell #[code spacy.lang.xx.lex_attrs]
+
+    +row
+        +cell #[code spacy.syntax.iterators]
+        +cell #[code spacy.lang.xx.syntax_iterators]
+
+    +row
+        +cell #[code spacy.tagger.Tagger]
+        +cell #[code spacy.pipeline.Tagger]
+
+    +row
+        +cell #[code spacy.cli.model]
+        +cell #[+api("cli#vocab") #[code spacy.cli.vocab]]
+
+    +row
+        +cell #[code Language.save_to_directory]
+        +cell #[+api("language#to_disk") #[code Language.to_disk]]
+
+    +row
+        +cell #[code Language.end_training]
+        +cell #[+api("language#begin_training") #[code Language.begin_training]]
+
+    +row
+        +cell #[code Language.create_make_doc]
+        +cell #[+api("language#attributes") #[code Language.tokenizer]]
+
+    +row
+        +cell
+            |  #[code Vocab.load]
+            |  #[code Vocab.load_lexemes]
+        +cell
+            |  #[+api("vocab#from_disk") #[code Vocab.from_disk]]
+            |  #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
+
+    +row
+        +cell
+            |  #[code Vocab.dump]
+        +cell
+            |  #[+api("vocab#to_disk") #[code Vocab.to_disk]]#[br]
+            |  #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
+
+    +row
+        +cell
+            |  #[code Vocab.load_vectors]
+            |  #[code Vocab.load_vectors_from_bin_loc]
+        +cell
+            |  #[+api("vectors#from_disk") #[code Vectors.from_disk]]
+            |  #[+api("vectors#from_bytes") #[code Vectors.from_bytes]]
+            |  #[+api("vectors#from_glove") #[code Vectors.from_glove]]
+
+    +row
+        +cell
+            |  #[code Vocab.dump_vectors]
+        +cell
+            |  #[+api("vectors#to_disk") #[code Vectors.to_disk]]
+            |  #[+api("vectors#to_bytes") #[code Vectors.to_bytes]]
+
+    +row
+        +cell
+            |  #[code StringStore.load]
+        +cell
+            |  #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
+            |  #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
+
+    +row
+        +cell
+            |  #[code StringStore.dump]
+        +cell
+            |  #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
+            |  #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+
+    +row
+        +cell #[code Tokenizer.load]
+        +cell
+            |  #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
+            |  #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+
+    +row
+        +cell #[code Tagger.load]
+        +cell
+            |  #[+api("tagger#from_disk") #[code Tagger.from_disk]]
+            |  #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+
+    +row
+        +cell #[code DependencyParser.load]
+        +cell
+            |  #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
+            |  #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+
+    +row
+        +cell #[code EntityRecognizer.load]
+        +cell
+            |  #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
+            |  #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+
+    +row
+        +cell #[code Matcher.load]
+        +cell -
+
+    +row
+        +cell
+            |  #[code Matcher.add_pattern]
+            |  #[code Matcher.add_entity]
+        +cell
+            |  #[+api("matcher#add") #[code Matcher.add]]
+            |  #[+api("phrasematcher#add") #[code PhraseMatcher.add]]
+
+    +row
+        +cell #[code Matcher.get_entity]
+        +cell #[+api("matcher#get") #[code Matcher.get]]
+
+    +row
+        +cell #[code Matcher.has_entity]
+        +cell #[+api("matcher#has_key") #[code Matcher.has_key]]
+
+    +row
+        +cell #[code Doc.read_bytes]
+        +cell
+            |  #[+api("doc#to_bytes") #[code Doc.to_bytes]]
+            |  #[+api("doc#from_bytes") #[code Doc.from_bytes]]
+            |  #[+api("doc#to_disk") #[code Doc.to_disk]]
+            |  #[+api("doc#from_disk") #[code Doc.from_disk]]
+
+    +row
+        +cell #[code Token.is_ancestor_of]
+        +cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+
+    +row
+        +cell #[code Token.sent_start]
+        +cell #[+api("token#is_sent_start") #[code Token.is_sent_start]]
diff --git a/website/usage/_v2/_migrating.jade b/website/usage/_v2/_migrating.jade
new file mode 100644
index 000000000..9bdf27411
--- /dev/null
+++ b/website/usage/_v2/_migrating.jade
@@ -0,0 +1,224 @@
+//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0 > MIGRATING FROM SPACY 1.X
+
+p
+    |  Because we've made so many architectural changes to the library, we've
+    |  tried to #[strong keep breaking changes to a minimum]. A lot of projects
+    |  follow the philosophy that if you're going to break anything, you may as
+    |  well break everything. We think migration is easier if there's a logic to
+    |  what has changed. We've therefore followed a policy of avoiding
+    |  breaking changes to the #[code Doc], #[code Span] and #[code Token]
+    |  objects. This way, you can focus on only migrating the code that
+    |  does training, loading and serialization — in other words, code that
+    |  works with the #[code nlp] object directly. Code that uses the
+    |  annotations should continue to work.
+
++infobox("Important note", "⚠️")
+    |  If you've trained your own models, keep in mind that your train and
+    |  runtime inputs must match. This means you'll have to
+    |  #[strong retrain your models] with spaCy v2.0.
+
++h(3, "migrating-saving-loading") Saving, loading and serialization
+
+p
+    |  Double-check all calls to #[code spacy.load()] and make sure they don't
+    |  use the #[code path] keyword argument. If you're only loading in binary
+    |  data and not a model package that can construct its own #[code Language]
+    |  class and pipeline, you should now use the
+    |  #[+api("language#from_disk") #[code Language.from_disk()]] method.
+
++code-new.
+    nlp = spacy.load('/model')
+    nlp = English().from_disk('/model/data')
++code-old nlp = spacy.load('en', path='/model')
+
+p
+    |  Review all other code that writes state to disk or bytes.
+    |  All containers now share the same, consistent API for saving and
+    |  loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and
+    |  loading with #[code from_disk()] and #[code from_bytes()].
+
++code-new.
+    nlp.to_disk('/model')
+    nlp.vocab.to_disk('/vocab')
+
++code-old.
+    nlp.save_to_directory('/model')
+    nlp.vocab.dump('/vocab')
+
+p
+    |  If you've trained models with input from v1.x, you'll need to
+    |  #[strong retrain them] with spaCy v2.0. All previous models will not
+    |  be compatible with the new version.
+
++h(3, "migrating-languages") Processing pipelines and language data
+
+p
+    |  If you're importing language data or #[code Language] classes, make sure
+    |  to change your import statements to import from #[code spacy.lang]. If
+    |  you've added your own custom language, it needs to be moved to
+    |  #[code spacy/lang/xx] and adjusted accordingly.
+
+.o-block
+    +code-new from spacy.lang.en import English
+    +code-old from spacy.en import English
+
+p
+    |  If you've been using custom pipeline components, check out the new
+    |  guide on #[+a("/usage/processing-pipelines") processing pipelines].
+    |  Pipeline components are now #[code (name, func)] tuples. Appending
+    |  them to the pipeline still works – but the
+    |  #[+api("language#add_pipe") #[code add_pipe]] method now makes this
+    |  much more convenient. Methods for removing, renaming, replacing and
+    |  retrieving components have been added as well. Components can now
+    |  be disabled by passing a list of their names to the #[code disable]
+    |  keyword argument on load, or by using
+    |  #[+api("language#disable_pipes") #[code disable_pipes]] as a method
+    |  or contextmanager:
+
+.o-block
+    +code-new.
+        nlp = spacy.load('en', disable=['tagger', 'ner'])
+        with nlp.disable_pipes('parser'):
+            doc = nlp(u"I don't want parsed")
+    +code-old.
+        nlp = spacy.load('en', tagger=False, entity=False)
+        doc = nlp(u"I don't want parsed", parse=False)
+
+p
+    |  To add spaCy's built-in pipeline components to your pipeline,
+    |  you can still import and instantiate them directly – but it's more
+    |  convenient to use the new
+    |  #[+api("language#create_pipe") #[code create_pipe]] method with the
+    |  component name, i.e. #[code 'tagger'], #[code 'parser'], #[code 'ner']
+    |  or #[code 'textcat'].
+
++code-new.
+    tagger = nlp.create_pipe('tagger')
+    nlp.add_pipe(tagger, first=True)
+
++code-old.
+    from spacy.pipeline import Tagger
+    tagger = Tagger(nlp.vocab)
+    nlp.pipeline.insert(0, tagger)
+
++h(3, "migrating-training") Training
+
+p
+    |  All built-in pipeline components are now subclasses of
+    |  #[+api("pipe") #[code Pipe]], are fully trainable and serializable,
+    |  and follow the same API. Instead of updating the model and telling
+    |  spaCy when to #[em stop], you can now explicitly call
+    |  #[+api("language#begin_training") #[code begin_training]], which
+    |  returns an optimizer you can pass into the
+    |  #[+api("language#update") #[code update]] function.
+
++code-new.
+    optimizer = nlp.begin_training()
+    for itn in range(1000):
+        for doc, gold in train_data:
+            nlp.update([doc], [gold], sgd=optimizer)
+    nlp.to_disk('/model')
++code-old.
+    for itn in range(1000):
+        for doc, gold in train_data:
+            nlp.update(doc, gold)
+    nlp.end_training()
+    nlp.save_to_directory('/model')
+
++h(3, "migrating-doc") Attaching custom data to the Doc
+
+p
+    |  Previously, you had to create a new container in order to attach custom
+    |  data to a #[code Doc] object. This often required converting the
+    |  #[code Doc] objects to and from arrays. In spaCy v2.0, you can set your
+    |  own attributes, properties and methods on the #[code Doc], #[code Token]
+    |  and #[code Span] via
+    |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom extensions].
+    |  This means that your application can – and should – only pass around
+    |  #[code Doc] objects and refer to them as the single source of truth.
+
++code-new.
+    Doc.set_extension('meta', getter=get_doc_meta)
+    doc_with_meta = nlp(u'This is a doc with meta data')
+    meta = doc._.meta
+
++code-old.
+    doc = nlp(u'This is a regular doc')
+    doc_array = doc.to_array(['ORTH', 'POS'])
+    doc_with_meta = {'doc_array': doc_array, 'meta': get_doc_meta(doc_array)}
+
+p
+    |  If you wrap your extension attributes in a
+    |  #[+a("/usage/processing-pipelines#custom-components") custom pipeline component],
+    |  they will be assigned automatically when you call #[code nlp] on a text.
+    |  If your application assigns custom data to spaCy's container objects,
+    |  or includes other utilities that interact with the pipeline, consider
+    |  moving this logic into its own extension module.
+
++code-new.
+    nlp.add_pipe(meta_component)
+    doc = nlp(u'Doc with a custom pipeline that assigns meta')
+    meta = doc._.meta
+
++code-old.
+    doc = nlp(u'Doc with a standard pipeline')
+    meta = get_meta(doc)
+
++h(3, "migrating-strings") Strings and hash values
+
+p
+    |  The change from integer IDs to hash values may not actually affect your
+    |  code very much. However, if you're adding strings to the vocab manually,
+    |  you now need to call #[+api("stringstore#add") #[code StringStore.add()]]
+    |  explicitly. You can also now be sure that the string-to-hash mapping will
+    |  always match across vocabularies.
+
++code-new.
+    nlp.vocab.strings.add(u'coffee')
+    nlp.vocab.strings[u'coffee']       # 3197928453018144401
+    other_nlp.vocab.strings[u'coffee'] # 3197928453018144401
+
++code-old.
+    nlp.vocab.strings[u'coffee']       # 3672
+    other_nlp.vocab.strings[u'coffee'] # 40259
+
++h(3, "migrating-matcher") Adding patterns and callbacks to the matcher
+
+p
+    |  If you're using the matcher, you can now add patterns in one step. This
+    |  should be easy to update – simply merge the ID, callback and patterns
+    |  into one call to #[+api("matcher#add") #[code Matcher.add()]]. The
+    |  matcher now also supports string keys, which saves you an extra import.
+    |  If you've been using #[strong acceptor functions], you'll need to move
+    |  this logic into the
+    |  #[+a("/usage/rule-based-matching#on_match") #[code on_match] callbacks].
+    |  The callback function is invoked on every match and will give you access to
+    |  the doc, the index of the current match and all total matches. This lets
+    |  you both accept or reject the match, and define the actions to be
+    |  triggered.
+
+.o-block
+    +code-new.
+        matcher.add('GoogleNow', merge_phrases, [{'ORTH': 'Google'}, {'ORTH': 'Now'}])
+
+    +code-old.
+        matcher.add_entity('GoogleNow', on_match=merge_phrases)
+        matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+
+p
+    |  If you need to match large terminology lists, you can now also
+    |  use the #[+api("phrasematcher") #[code PhraseMatcher]], which accepts
+    |  #[code Doc] objects as match patterns and is more efficient than the
+    |  regular, rule-based matcher.
+
++code-new.
+    from spacy.matcher import PhraseMatcher
+    matcher = PhraseMatcher(nlp.vocab)
+    patterns = [nlp(text) for text in large_terminology_list]
+    matcher.add('PRODUCT', None, *patterns)
+
++code-old.
+    matcher = Matcher(nlp.vocab)
+    matcher.add_entity('PRODUCT')
+    for text in large_terminology_list:
+        matcher.add_pattern('PRODUCT', [{ORTH: text}])
diff --git a/website/usage/_v2/_summary.jade b/website/usage/_v2/_summary.jade
new file mode 100644
index 000000000..84f238476
--- /dev/null
+++ b/website/usage/_v2/_summary.jade
@@ -0,0 +1,74 @@
+//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0 > SUMMARY
+
+p
+    |  We're very excited to finally introduce spaCy v2.0! On this page, you'll
+    |  find a summary of the new features, information on the backwards
+    |  incompatibilities, including a handy overview of what's been renamed or
+    |  deprecated. To help you make the most of v2.0, we also
+    |  #[strong re-wrote almost all of the usage guides and API docs], and added
+    |  more #[+a("/usage/examples") real-world examples]. If you're new to
+    |  spaCy, or just want to brush up on some NLP basics and the details of
+    |  the library, check out the
+    |  #[+a("/usage/spacy-101") spaCy 101 guide] that explains the most
+    |  important concepts with examples and illustrations.
+
++h(2, "summary") Summary
+
++grid.o-no-block
+    +grid-col("half")
+
+        p
+            |  This release features entirely new
+            |  #[strong deep learning-powered models] for spaCy's tagger,
+            |  parser and entity recognizer. The new models are
+            |  #[strong 10&times; smaller], #[strong 20% more accurate] and
+            |  just as fast as the previous generation.
+
+        p
+            |  We've also made several usability improvements that are
+            |  particularly helpful for #[strong production deployments].
+            |  spaCy v2 now fully supports the Pickle protocol, making it
+            |  easy to use spaCy with
+            |  #[+a("https://spark.apache.org/") Apache Spark]. The
+            |  string-to-integer mapping is #[strong no longer stateful],
+            |  making it easy to reconcile annotations made in different
+            |  processes. Models are smaller and use less memory, and the
+            |  APIs for serialization are now much more consistent. Custom
+            |  pipeline components let you modify the #[code Doc] at any
+            |  stage in the pipeline. You can now also add your own
+            |  custom attributes, properties and methods to the #[code Doc],
+            |  #[code Token] and #[code Span].
+
+    +table-of-contents
+        +item #[+a("#summary") Summary]
+        +item #[+a("#features") New features]
+        +item #[+a("#features-models") Neural network models]
+        +item #[+a("#features-pipelines") Improved processing pipelines]
+        +item #[+a("#features-text-classification") Text classification]
+        +item #[+a("#features-hash-ids") Hash values as IDs]
+        +item #[+a("#features-vectors") Improved word vectors support]
+        +item #[+a("#features-serializer") Saving, loading and serialization]
+        +item #[+a("#features-displacy") displaCy visualizer]
+        +item #[+a("#features-language") Language data and lazy loading]
+        +item #[+a("#features-matcher") Revised matcher API and phrase matcher]
+        +item #[+a("#incompat") Backwards incompatibilities]
+        +item #[+a("#migrating") Migrating from spaCy v1.x]
+        +item #[+a("#benchmarks") Benchmarks]
+
+p
+    |  The main usability improvements you'll notice in spaCy v2.0 are around
+    |  #[strong defining, training and loading your own models] and components.
+    |  The new neural network models make it much easier to train a model from
+    |  scratch, or update an existing model with a few examples. In v1.x, the
+    |  statistical models depended on the state of the #[code Vocab]. If you
+    |  taught the model a new word, you would have to save and load a lot of
+    |  data — otherwise the model wouldn't correctly recall the features of your
+    |  new example. That's no longer the case.
+
+p
+    |  Due to some clever use of hashing, the statistical models
+    |  #[strong never change size], even as they learn new vocabulary items.
+    |  The whole pipeline is also now fully differentiable. Even if you don't
+    |  have explicitly annotated data, you can update spaCy using all the
+    |  #[strong latest deep learning tricks] like adversarial training, noise
+    |  contrastive estimation or reinforcement learning.
diff --git a/website/usage/v2.jade b/website/usage/v2.jade
index f833468bf..8662a8fce 100644
--- a/website/usage/v2.jade
+++ b/website/usage/v2.jade
@@ -2,531 +2,22 @@
 
 include ../_includes/_mixins
 
-p
-    |  We're very excited to finally introduce spaCy v2.0! On this page, you'll
-    |  find a summary of the new features, information on the backwards
-    |  incompatibilities, including a handy overview of what's been renamed or
-    |  deprecated. To help you make the most of v2.0, we also
-    |  #[strong re-wrote almost all of the usage guides and API docs], and added
-    |  more real-world examples. If you're new to spaCy, or just want to brush
-    |  up on some NLP basics and the details of the library, check out
-    |  the #[+a("/usage/spacy-101") spaCy 101 guide] that explains the most
-    |  important concepts with examples and illustrations.
++section("summary")
+    include _v2/_summary
 
-+h(2, "summary") Summary
-
-+grid.o-no-block
-    +grid-col("half")
-
-        p This release features
-            |  entirely new #[strong deep learning-powered models] for spaCy's tagger,
-            |  parser and entity recognizer. The new models are #[strong 20x smaller]
-            |  than the linear models that have powered spaCy until now: from 300 MB to
-            |  only 15 MB.
-
-        p
-            |  We've also made several usability improvements that are
-            |  particularly helpful for #[strong production deployments]. spaCy
-            |  v2 now fully supports the Pickle protocol, making it easy to use
-            |  spaCy with #[+a("https://spark.apache.org/") Apache Spark]. The
-            |  string-to-integer mapping is #[strong no longer stateful], making
-            |  it easy to reconcile annotations made in different processes.
-            |  Models are smaller and use less memory, and the APIs for serialization
-            |  are now much more consistent.
-
-    +table-of-contents
-        +item #[+a("#summary") Summary]
-        +item #[+a("#features") New features]
-        +item #[+a("#features-models") Neural network models]
-        +item #[+a("#features-pipelines") Improved processing pipelines]
-        +item #[+a("#features-text-classification") Text classification]
-        +item #[+a("#features-hash-ids") Hash values instead of integer IDs]
-        +item #[+a("#features-serializer") Saving, loading and serialization]
-        +item #[+a("#features-displacy") displaCy visualizer]
-        +item #[+a("#features-language") Language data and lazy loading]
-        +item #[+a("#features-matcher") Revised matcher API and phrase matcher]
-        +item #[+a("#incompat") Backwards incompatibilities]
-        +item #[+a("#migrating") Migrating from spaCy v1.x]
-        +item #[+a("#benchmarks") Benchmarks]
-
-p
-    |  The main usability improvements you'll notice in spaCy v2.0 are around
-    |  #[strong defining, training and loading your own models] and components.
-    |  The new neural network models make it much easier to train a model from
-    |  scratch, or update an existing model with a few examples. In v1.x, the
-    |  statistical models depended on the state of the #[code Vocab]. If you
-    |  taught the model a new word, you would have to save and load a lot of
-    |  data — otherwise the model wouldn't correctly recall the features of your
-    |  new example. That's no longer the case.
-
-p
-    |  Due to some clever use of hashing, the statistical models
-    |  #[strong never change size], even as they learn new vocabulary items.
-    |  The whole pipeline is also now fully differentiable. Even if you don't
-    |  have explicitly annotated data, you can update spaCy using all the
-    |  #[strong latest deep learning tricks] like adversarial training, noise
-    |  contrastive estimation or reinforcement learning.
 
 +section("features")
     +h(2, "features") New features
-
-    p
-        |  This section contains an overview of the most important
-        |  #[strong new features and improvements]. The #[+a("/api") API docs]
-        |  include additional  deprecation notes. New methods and functions that
-        |  were introduced in this version are marked with a #[+tag-new(2)] tag.
-
-    +h(3, "features-models") Convolutional neural network models
-
-    +aside-code("Example", "bash").
-        spacy download en # default English model
-        spacy download de # default German model
-        spacy download fr # default French model
-        spacy download es # default Spanish model
-        spacy download xx_ent_wiki_sm # multi-language NER
-
-    p
-        |  spaCy v2.0 features new neural models for tagging,
-        |  parsing and entity recognition. The models have
-        |  been designed and implemented from scratch specifically for spaCy, to
-        |  give you an unmatched balance of speed, size and accuracy. The new
-        |  models are #[strong 10&times; smaller], #[strong 20% more accurate],
-        |  and #[strong just as fast] as the previous generation.
-        |  #[strong GPU usage] is now supported via
-        |  #[+a("http://chainer.org") Chainer]'s CuPy module.
-
-    +infobox
-        |  #[+label-inline Usage:] #[+a("/models") Models directory],
-        |  #[+a("/usage/#gpu") Using spaCy with GPU]
-
-    +h(3, "features-pipelines") Improved processing pipelines
-
-    +aside-code("Example").
-        # Set custom attributes
-        Doc.set_extension('my_attr', default=False)
-        Token.set_extension('my_attr', getter=my_token_getter)
-        assert doc._.my_attr, token._.my_attr
-
-        # Add components to the pipeline
-        my_component = lambda doc: doc
-        nlp.add_pipe(my_component)
-
-    p
-        |  It's now much easier to #[strong customise the pipeline] with your own
-        |  components: functions that receive a #[code Doc] object, modify and
-        |  return it. Extensions let you write any
-        |  #[strong attributes, properties and methods] to the #[code Doc],
-        |  #[code Token] and #[code Span]. You can add data, implement new
-        |  features, integrate other libraries with spaCy or plug in your own
-        |  machine learning models.
-
-    +image
-        include ../assets/img/pipeline.svg
-
-    +infobox
-        |  #[+label-inline API:] #[+api("language") #[code Language]],
-        |  #[+api("doc#set_extension") #[code Doc.set_extension]],
-        |  #[+api("span#set_extension") #[code Span.set_extension]],
-        |  #[+api("token#set_extension") #[code Token.set_extension]]
-        |  #[+label-inline Usage:]
-        |  #[+a("/usage/processing-pipelines") Processing pipelines]
-        |  #[+label-inline Code:]
-        |  #[+src("/usage/examples#section-pipeline") Pipeline examples]
-
-    +h(3, "features-text-classification") Text classification
-
-    +aside-code("Example").
-        from spacy.lang.en import English
-        nlp = English(pipeline=['tensorizer', 'tagger', 'textcat'])
-
-    p
-        |  spaCy v2.0 lets you add text categorization models to spaCy pipelines.
-        |  The model supports classification with multiple, non-mutually exclusive
-        |  labels – so multiple labels can apply at once. You can change the model
-        |  architecture rather easily, but by default, the #[code TextCategorizer]
-        |  class uses a convolutional neural network to assign position-sensitive
-        |  vectors to each word in the document.
-
-    +infobox
-        |  #[+label-inline API:] #[+api("textcategorizer") #[code TextCategorizer]],
-        |  #[+api("doc#attributes") #[code Doc.cats]],
-        |  #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br]
-        |  #[+label-inline Usage:] #[+a("/usage/text-classification") Text classification]
-
-    +h(3, "features-hash-ids") Hash values instead of integer IDs
-
-    +aside-code("Example").
-        doc = nlp(u'I love coffee')
-        assert doc.vocab.strings[u'coffee'] == 3197928453018144401
-        assert doc.vocab.strings[3197928453018144401] == u'coffee'
-
-        beer_hash = doc.vocab.strings.add(u'beer')
-        assert doc.vocab.strings[u'beer'] == beer_hash
-        assert doc.vocab.strings[beer_hash] == u'beer'
-
-    p
-        |  The #[+api("stringstore") #[code StringStore]] now resolves all strings
-        |  to hash values instead of integer IDs. This means that the string-to-int
-        |  mapping #[strong no longer depends on the vocabulary state], making a lot
-        |  of workflows much simpler, especially during training. Unlike integer IDs
-        |  in spaCy v1.x, hash values will #[strong always match] – even across
-        |  models. Strings can now be added explicitly using the new
-        |  #[+api("stringstore#add") #[code Stringstore.add]] method. A token's hash
-        |  is available via #[code token.orth].
-
-    +infobox
-        |  #[+label-inline API:] #[+api("stringstore") #[code StringStore]]
-        |  #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
-
-    +h(3, "features-serializer") Saving, loading and serialization
-
-    +aside-code("Example").
-        nlp = spacy.load('en') # shortcut link
-        nlp = spacy.load('en_core_web_sm') # package
-        nlp = spacy.load('/path/to/en') # unicode path
-        nlp = spacy.load(Path('/path/to/en')) # pathlib Path
-
-        nlp.to_disk('/path/to/nlp')
-        nlp = English().from_disk('/path/to/nlp')
-
-    p
-        |  spay's serialization API has been made consistent across classes and
-        |  objects. All container classes, i.e. #[code Language], #[code Doc],
-        |  #[code Vocab] and #[code StringStore] now have a #[code to_bytes()],
-        |  #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method
-        |  that supports the Pickle protocol.
-
-    p
-        |  The improved #[code spacy.load] makes loading models easier and more
-        |  transparent. You can load a model by supplying its
-        |  #[+a("/usage/models#usage") shortcut link], the name of an installed
-        |  #[+a("/usage/saving-loading#generating") model package] or a path.
-        |  The #[code Language] class to initialise will be determined based on the
-        |  model's settings. For a blank language, you can import the class directly,
-        |  e.g. #[code from spacy.lang.en import English].
-
-    +infobox
-        |  #[+label-inline API:] #[+api("spacy#load") #[code spacy.load]]
-        |  #[+label-inline Usage:] #[+a("/usage/saving-loading") Saving and loading]
-
-    +h(3, "features-displacy") displaCy visualizer with Jupyter support
-
-    +aside-code("Example").
-        from spacy import displacy
-        doc = nlp(u'This is a sentence about Facebook.')
-        displacy.serve(doc, style='dep') # run the web server
-        html = displacy.render(doc, style='ent') # generate HTML
-
-    p
-        |  Our popular dependency and named entity visualizers are now an official
-        |  part of the spaCy library. displaCy can run a simple web server, or
-        |  generate raw HTML markup or SVG files to be exported. You can pass in one
-        |  or more docs, and customise the style. displaCy also auto-detects whether
-        |  you're running #[+a("https://jupyter.org") Jupyter] and will render the
-        |  visualizations in your notebook.
-
-    +infobox
-        |  #[+label-inline API:] #[+api("displacy") #[code displacy]]
-        |  #[+label-inline Usage:] #[+a("/usage/visualizers") Visualizing spaCy]
-
-    +h(3, "features-language") Improved language data and lazy loading
-
-    p
-        |  Language-specfic data now lives in its own submodule, #[code spacy.lang].
-        |  Languages are lazy-loaded, i.e. only loaded when you import a
-        |  #[code Language] class, or load a model that initialises one. This allows
-        |  languages to contain more custom data, e.g. lemmatizer lookup tables, or
-        |  complex regular expressions. The language data has also been tidied up
-        |  and simplified. spaCy now also supports simple lookup-based lemmatization.
-
-    +infobox
-        |  #[+label-inline API:] #[+api("language") #[code Language]]
-        |  #[+label-inline Code:] #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]]
-        |  #[+label-inline Usage:] #[+a("/usage/adding-languages") Adding languages]
-
-    +h(3, "features-matcher") Revised matcher API and phrase matcher
-
-    +aside-code("Example").
-        from spacy.matcher import Matcher, PhraseMatcher
-
-        matcher = Matcher(nlp.vocab)
-        matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
-
-        phrasematcher = PhraseMatcher(nlp.vocab)
-        phrasematcher.add('OBAMA', None, nlp(u"Barack Obama"))
-
-    p
-        |  Patterns can now be added to the matcher by calling
-        |  #[+api("matcher-add") #[code matcher.add()]] with a match ID, an optional
-        |  callback function to be invoked on each match, and one or more patterns.
-        |  This allows you to write powerful, pattern-specific logic using only one
-        |  matcher. For example, you might only want to merge some entity types,
-        |  and set custom flags for other matched patterns. The new
-        |  #[+api("phrasematcher") #[code PhraseMatcher]] lets you efficiently
-        |  match very large terminology lists using #[code Doc] objects as match
-        |  patterns.
-
-    +infobox
-        |  #[+label-inline API:] #[+api("matcher") #[code Matcher]],
-        |  #[+api("phrasematcher") #[code PhraseMatcher]]
-        |  #[+label-inline Usage:] #[+a("/usage/rule-based-matching") Rule-based matching]
+    include _v2/_features
 
 +section("incompat")
     +h(2, "incompat") Backwards incompatibilities
-
-    +table(["Old", "New"])
-        +row
-            +cell
-                |  #[code spacy.en]
-                |  #[code spacy.xx]
-            +cell
-                |  #[code spacy.lang.en]
-                |  #[code spacy.lang.xx]
-
-        +row
-            +cell #[code orth]
-            +cell #[code lang.xx.lex_attrs]
-
-        +row
-            +cell #[code syntax.iterators]
-            +cell #[code lang.xx.syntax_iterators]
-
-        +row
-            +cell #[code Language.save_to_directory]
-            +cell #[+api("language#to_disk") #[code Language.to_disk]]
-
-        +row
-            +cell #[code Language.create_make_doc]
-            +cell #[+api("language#attributes") #[code Language.tokenizer]]
-
-        +row
-            +cell
-                |  #[code Vocab.load]
-                |  #[code Vocab.load_lexemes]
-            +cell
-                |  #[+api("vocab#from_disk") #[code Vocab.from_disk]]
-                |  #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
-
-        +row
-            +cell
-                |  #[code Vocab.dump]
-            +cell
-                |  #[+api("vocab#to_disk") #[code Vocab.to_disk]]#[br]
-                |  #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
-
-        +row
-            +cell
-                |  #[code Vocab.load_vectors]
-                |  #[code Vocab.load_vectors_from_bin_loc]
-            +cell
-                |  #[+api("vectors#from_disk") #[code Vectors.from_disk]]
-                |  #[+api("vectors#from_bytes") #[code Vectors.from_bytes]]
-
-        +row
-            +cell
-                |  #[code Vocab.dump_vectors]
-            +cell
-                |  #[+api("vectors#to_disk") #[code Vectors.to_disk]]
-                |  #[+api("vectors#to_bytes") #[code Vectors.to_bytes]]
-
-        +row
-            +cell
-                |  #[code StringStore.load]
-            +cell
-                |  #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
-                |  #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
-
-        +row
-            +cell
-                |  #[code StringStore.dump]
-            +cell
-                |  #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
-                |  #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
-
-        +row
-            +cell #[code Tokenizer.load]
-            +cell
-                |  #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
-                |  #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
-
-        +row
-            +cell #[code Tagger.load]
-            +cell
-                |  #[+api("tagger#from_disk") #[code Tagger.from_disk]]
-                |  #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
-
-        +row
-            +cell #[code DependencyParser.load]
-            +cell
-                |  #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
-                |  #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
-
-        +row
-            +cell #[code EntityRecognizer.load]
-            +cell
-                |  #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
-                |  #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
-
-        +row
-            +cell #[code Matcher.load]
-            +cell -
-
-        +row
-            +cell
-                |  #[code Matcher.add_pattern]
-                |  #[code Matcher.add_entity]
-            +cell #[+api("matcher#add") #[code Matcher.add]]
-
-        +row
-            +cell #[code Matcher.get_entity]
-            +cell #[+api("matcher#get") #[code Matcher.get]]
-
-        +row
-            +cell #[code Matcher.has_entity]
-            +cell #[+api("matcher#contains") #[code Matcher.__contains__]]
-
-        +row
-            +cell #[code Doc.read_bytes]
-            +cell
-                |  #[+api("doc#to_bytes") #[code Doc.to_bytes]]
-                |  #[+api("doc#from_bytes") #[code Doc.from_bytes]]
-                |  #[+api("doc#to_disk") #[code Doc.to_disk]]
-                |  #[+api("doc#from_disk") #[code Doc.from_disk]]
-
-        +row
-            +cell #[code Token.is_ancestor_of]
-            +cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
-
-        +row
-            +cell #[code cli.model]
-            +cell -
+    include _v2/_incompat
 
 +section("migrating")
     +h(2, "migrating") Migrating from spaCy 1.x
-
-    p
-        |  Because we'e made so many architectural changes to the library, we've
-        |  tried to #[strong keep breaking changes to a minimum]. A lot of projects
-        |  follow the philosophy that if you're going to break anything, you may as
-        |  well break everything. We think migration is easier if there's a logic to
-        |  what has changed.
-
-    p
-        |  We've therefore followed a policy of avoiding breaking changes to the
-        |  #[code Doc], #[code Span] and #[code Token] objects. This way, you can
-        |  focus on only migrating the code that does training, loading and
-        |  serialization — in other words, code that works with the #[code nlp]
-        |  object directly. Code that uses the annotations should continue to work.
-
-    +infobox("Important note")
-        |  If you've trained your own models, keep in mind that your train and
-        |  runtime inputs must match. This means you'll have to
-        |  #[strong retrain your models] with spaCy v2.0.
-
-    +h(3, "migrating-saving-loading") Saving, loading and serialization
-
-    p
-        |  Double-check all calls to #[code spacy.load()] and make sure they don't
-        |  use the #[code path] keyword argument. If you're only loading in binary
-        |  data and not a model package that can construct its own #[code Language]
-        |  class and pipeline, you should now use the
-        |  #[+api("language#from_disk") #[code Language.from_disk()]] method.
-
-    +code-new.
-        nlp = spacy.load('/model')
-        nlp = English().from_disk('/model/data')
-    +code-old nlp = spacy.load('en', path='/model')
-
-    p
-        |  Review all other code that writes state to disk or bytes.
-        |  All containers, now share the same, consistent API for saving and
-        |  loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and
-        |  loading with #[code from_disk()] and #[code from_bytes()].
-
-    +code-new.
-        nlp.to_disk('/model')
-        nlp.vocab.to_disk('/vocab')
-
-    +code-old.
-        nlp.save_to_directory('/model')
-        nlp.vocab.dump('/vocab')
-
-    p
-        |  If you've trained models with input from v1.x, you'll need to
-        |  #[strong retrain them] with spaCy v2.0. All previous models will not
-        |  be compatible with the new version.
-
-    +h(3, "migrating-strings") Strings and hash values
-
-    p
-        |  The change from integer IDs to hash values may not actually affect your
-        |  code very much. However, if you're adding strings to the vocab manually,
-        |  you now need to call #[+api("stringstore#add") #[code StringStore.add()]]
-        |  explicitly. You can also now be sure that the string-to-hash mapping will
-        |  always match across vocabularies.
-
-    +code-new.
-        nlp.vocab.strings.add(u'coffee')
-        nlp.vocab.strings[u'coffee']       # 3197928453018144401
-        other_nlp.vocab.strings[u'coffee'] # 3197928453018144401
-
-    +code-old.
-        nlp.vocab.strings[u'coffee']       # 3672
-        other_nlp.vocab.strings[u'coffee'] # 40259
-
-    +h(3, "migrating-languages") Processing pipelines and language data
-
-    p
-        |  If you're importing language data or #[code Language] classes, make sure
-        |  to change your import statements to import from #[code spacy.lang]. If
-        |  you've added your own custom language, it needs to be moved to
-        |  #[code spacy/lang/xx] and adjusted accordingly.
-
-    +code-new from spacy.lang.en import English
-    +code-old from spacy.en import English
-
-    p
-        |  If you've been using custom pipeline components, check out the new
-        |  guide on #[+a("/usage/language-processing-pipelines") processing pipelines].
-        |  Appending functions to the pipeline still works – but the
-        |  #[+api("language#add_pipe") #[code add_pipe]] methods now makes this
-        |  much more convenient. Components of the processing pipeline can now
-        |  be disabled by passing a list of their names to the #[code disable]
-        |  keyword argument on load, or by simply demoving them from the
-        |  pipeline alltogether.
-
-    +code-new.
-        nlp = spacy.load('en', disable=['tagger', 'ner'])
-        doc = nlp(u"I don't want parsed", disable['parser'])
-        nlp.remove_pipe('parser')
-    +code-old.
-        nlp = spacy.load('en', tagger=False, entity=False)
-        doc = nlp(u"I don't want parsed", parse=False)
-
-    +h(3, "migrating-matcher") Adding patterns and callbacks to the matcher
-
-    p
-        |  If you're using the matcher, you can now add patterns in one step. This
-        |  should be easy to update – simply merge the ID, callback and patterns
-        |  into one call to #[+api("matcher#add") #[code Matcher.add()]].
-
-    +code-new.
-        matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
-
-    +code-old.
-        matcher.add_entity('GoogleNow', on_match=merge_phrases)
-        matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
-
-    p
-        |  If you've been using #[strong acceptor functions], you'll need to move
-        |  this logic into the
-        |  #[+a("/usage/rule-based-matching#on_match") #[code on_match] callbacks].
-        |  The callback function is invoked on every match and will give you access to
-        |  the doc, the index of the current match and all total matches. This lets
-        |  you both accept or reject the match, and define the actions to be
-        |  triggered.
+    include _v2/_migrating
 
 +section("benchmarks")
     +h(2, "benchmarks") Benchmarks
-
     include _facts-figures/_benchmarks-models

From e7a9174877720b314cf121ba01f0e5e13f76698e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 16:32:44 +0100
Subject: [PATCH 88/90] Add add_label methods to Tagger and TextCategorizer

---
 spacy/pipeline.pyx | 48 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 842e27069..a2321d1ad 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -11,9 +11,9 @@ import ujson
 import msgpack
 
 from thinc.api import chain
-from thinc.v2v import Softmax
+from thinc.v2v import Affine, Softmax
 from thinc.t2v import Pooling, max_pool, mean_pool
-from thinc.neural.util import to_categorical
+from thinc.neural.util import to_categorical, copy_array
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 
 from .tokens.doc cimport Doc
@@ -130,6 +130,15 @@ class Pipe(object):
         documents and their predicted scores."""
         raise NotImplementedError
 
+    def add_label(self, label):
+        """Add an output label, to be predicted by the model.
+
+        It's possible to extend pre-trained models with new labels,
+        but care should be taken to avoid the "catastrophic forgetting"
+        problem.
+        """
+        raise NotImplementedError
+
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
         """Initialize the pipe for training, using data exampes if available.
         If no model has been initialized yet, the model is added."""
@@ -325,6 +334,14 @@ class Tagger(Pipe):
         self.cfg.setdefault('pretrained_dims',
                             self.vocab.vectors.data.shape[1])
 
+    @property
+    def labels(self):
+        return self.cfg.setdefault('tag_names', [])
+
+    @labels.setter
+    def labels(self, value):
+        self.cfg['tag_names'] = value
+
     def __call__(self, doc):
         tags = self.predict([doc])
         self.set_annotations([doc], tags)
@@ -352,6 +369,7 @@ class Tagger(Pipe):
         cdef Doc doc
         cdef int idx = 0
         cdef Vocab vocab = self.vocab
+        tags = list(self.labels)
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, 'get'):
@@ -359,7 +377,7 @@ class Tagger(Pipe):
             for j, tag_id in enumerate(doc_tag_ids):
                 # Don't clobber preset POS tags
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:
-                    vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
+                    vocab.morphology.assign_tag(&doc.c[j], tags[tag_id])
                 idx += 1
         doc.is_tagged = True
 
@@ -420,6 +438,17 @@ class Tagger(Pipe):
     def Model(cls, n_tags, **cfg):
         return build_tagger_model(n_tags, **cfg)
 
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        smaller = self.model[-1]._layers[-1]
+        larger = Softmax(len(self.labels)+1, smaller.nI)
+        copy_array(larger.W[:smaller.nO], smaller.W)
+        copy_array(larger.b[:smaller.nO], smaller.b)
+        self.model[-1]._layers[-1] = larger
+        self.labels.append(label)
+        return 1
+
     def use_params(self, params):
         with self.model.use_params(params):
             yield
@@ -675,7 +704,7 @@ class TextCategorizer(Pipe):
 
     @property
     def labels(self):
-        return self.cfg.get('labels', ['LABEL'])
+        return self.cfg.setdefault('labels', ['LABEL'])
 
     @labels.setter
     def labels(self, value):
@@ -727,6 +756,17 @@ class TextCategorizer(Pipe):
         mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
         return mean_square_error, d_scores
 
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        smaller = self.model[-1]._layers[-1]
+        larger = Affine(len(self.labels)+1, smaller.nI)
+        copy_array(larger.W[:smaller.nO], smaller.W)
+        copy_array(larger.b[:smaller.nO], smaller.b)
+        self.model[-1]._layers[-1] = larger
+        self.labels.append(label)
+        return 1
+
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
             token_vector_width = pipeline[0].model.nO

From 9f9439667bdd8f1a002794bf0e49cc785a8b19bb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 16:34:09 +0100
Subject: [PATCH 89/90] Don't create low-data text classifier if no vectors

---
 spacy/_ml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 6bfacb20a..89e3d8ac6 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -434,7 +434,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
     pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
                                  '**': clone}):
-        if cfg.get('low_data'):
+        if cfg.get('low_data') and pretrained_dims:
             model = (
                 SpacyVectors
                 >> flatten_add_lengths

From dad8f09fba1fe00cf24f3ab5d920bbe4168f4709 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 16:34:31 +0100
Subject: [PATCH 90/90] Fix print statements in text classifier example

---
 examples/training/train_textcat.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 852635075..6fa79e75b 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -26,8 +26,9 @@ from spacy.pipeline import TextCategorizer
 @plac.annotations(
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
+    n_texts=("Number of texts to train from", "option", "N", int),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=20):
+def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
         print("Loaded model '%s'" % model)
@@ -50,7 +51,8 @@ def main(model=None, output_dir=None, n_iter=20):
 
     # load the IMBD dataset
     print("Loading IMDB data...")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
+    print("Using %d training examples" % n_texts)
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
     train_docs = [nlp.tokenizer(text) for text in train_texts]
     train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                   zip(train_docs, train_cats)]
@@ -65,14 +67,14 @@ def main(model=None, output_dir=None, n_iter=20):
         for i in range(n_iter):
             losses = {}
             # batch up the examples using spaCy's minibatch
-            batches = minibatch(train_data, size=compounding(4., 128., 1.001))
+            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
             with textcat.model.use_params(optimizer.averages):
                 # evaluate on the dev data split off in load_data()
                 scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
-            print('{0:.3f}\t{0:.3f}\t{0:.3f}\t{0:.3f}'  # print a simple table
+            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                   .format(losses['textcat'], scores['textcat_p'],
                           scores['textcat_r'], scores['textcat_f']))