From 692eb0603d5305a19407302721d4cd6790235496 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 20 Mar 2017 18:24:44 +0100
Subject: [PATCH 01/15] Fix high memory usage in download command

Due to pip issue #2984, installing large packages via pip causes
a large spike in memory usage. The recommended fix is to disable
pip's cache, so the download command now passes --no-cache-dir.
---
 spacy/cli/download.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 446de4a37..56dbd5264 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -56,7 +56,8 @@ def get_version(model, comp):
 def download_model(filename):
     util.print_msg("Downloading {f}".format(f=filename))
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m', 'pip', 'install', download_url],
+    subprocess.call([sys.executable, '-m',
+        'pip', 'install', '--no-cache-dir', download_url],
         env=os.environ.copy())
 
 

From 8bc05c2ba97dd51fa9a066def0ab82a97ca55d11 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Thu, 23 Mar 2017 11:07:59 +0100
Subject: [PATCH 02/15] Delete old training scripts (resolves #911)

---
 bin/parser/conll_parse.py | 130 -------------------
 bin/parser/nn_train.py    | 261 --------------------------------------
 2 files changed, 391 deletions(-)
 delete mode 100644 bin/parser/conll_parse.py
 delete mode 100755 bin/parser/nn_train.py

diff --git a/bin/parser/conll_parse.py b/bin/parser/conll_parse.py
deleted file mode 100644
index 85a81c432..000000000
--- a/bin/parser/conll_parse.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-import time
-import gzip
-
-import plac
-import cProfile
-import pstats
-
-import spacy.util
-from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
-
-from spacy.syntax.parser import GreedyParser
-from spacy.syntax.parser import OracleError
-from spacy.syntax.util import Config
-
-
-def is_punct_label(label):
-    return label == 'P' or label.lower() == 'punct'
-
-
-def read_gold(file_):
-    """Read a standard CoNLL/MALT-style format"""
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
-        ids = []
-        words = []
-        heads = []
-        labels = []
-        tags = []
-        for i, line in enumerate(sent_str.split('\n')):
-            id_, word, pos_string, head_idx, label = _parse_line(line)
-            words.append(word)
-            if head_idx == -1:
-                head_idx = i
-            ids.append(id_)
-            heads.append(head_idx)
-            labels.append(label)
-            tags.append(pos_string)
-        text = ' '.join(words)
-        sents.append((text, [words], ids, words, tags, heads, labels))
-    return sents
-
-
-def _parse_line(line):
-    pieces = line.split()
-    id_ = int(pieces[0])
-    word = pieces[1]
-    pos = pieces[3]
-    head_idx = int(pieces[6])
-    label = pieces[7]
-    return id_, word, pos, head_idx, label
-
-        
-def iter_data(paragraphs, tokenizer, gold_preproc=False):
-    for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
-        assert len(words) == len(heads)
-        for words in tokenized:
-            sent_ids = ids[:len(words)]
-            sent_tags = tags[:len(words)]
-            sent_heads = heads[:len(words)]
-            sent_labels = labels[:len(words)]
-            sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
-            tokens = tokenizer.tokens_from_list(words)
-            yield tokens, sent_tags, sent_heads, sent_labels
-            ids = ids[len(words):]
-            tags = tags[len(words):]
-            heads = heads[len(words):]
-            labels = labels[len(words):]
-
-
-def _map_indices_to_tokens(ids, heads):
-    mapped = []
-    for head in heads:
-        if head not in ids:
-            mapped.append(None)
-        else:
-            mapped.append(ids.index(head))
-    return mapped
-
-
-
-def evaluate(Language, dev_loc, model_dir):
-    global loss
-    nlp = Language()
-    n_corr = 0
-    pos_corr = 0
-    n_tokens = 0
-    total = 0
-    skipped = 0
-    loss = 0
-    with codecs.open(dev_loc, 'r', 'utf8') as file_:
-        paragraphs = read_gold(file_)
-    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
-        assert len(tokens) == len(labels)
-        nlp.tagger.tag_from_strings(tokens, tag_strs)
-        nlp.parser(tokens)
-        for i, token in enumerate(tokens):
-            try:
-                pos_corr += token.tag_ == tag_strs[i]
-            except:
-                print i, token.orth_, token.tag
-                raise
-            n_tokens += 1
-            if heads[i] is None:
-                skipped += 1
-                continue
-            if is_punct_label(labels[i]):
-                continue
-            n_corr += token.head.i == heads[i]
-            total += 1
-    print loss, skipped, (loss+skipped + total)
-    print pos_corr / n_tokens
-    return float(n_corr) / (total + loss)
-
-
-def main(dev_loc, model_dir):
-    print evaluate(English, dev_loc, model_dir)
-    
-
-if __name__ == '__main__':
-    plac.call(main)
diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py
deleted file mode 100755
index 72c9e04f1..000000000
--- a/bin/parser/nn_train.py
+++ /dev/null
@@ -1,261 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-
-import plac
-import cProfile
-import pstats
-import re
-
-import spacy.util
-from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
-
-from spacy.syntax.util import Config
-from spacy.gold import read_json_file
-from spacy.gold import GoldParse
-
-from spacy.scorer import Scorer
-
-from spacy.syntax.parser import Parser, get_templates
-from spacy._theano import TheanoModel
-
-import theano
-import theano.tensor as T
-
-from theano.printing import Print
-
-import numpy
-from collections import OrderedDict, defaultdict
-
-
-theano.config.profile = False
-theano.config.floatX = 'float32'
-floatX = theano.config.floatX
-
-
-def L1(L1_reg, *weights):
-    return L1_reg * sum(abs(w).sum() for w in weights)
-
-
-def L2(L2_reg, *weights):
-    return L2_reg * sum((w ** 2).sum() for w in weights)
-
-
-def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
-    updates = OrderedDict()
-    for param in params:
-        value = param.get_value(borrow=True)
-        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
-                             broadcastable=param.broadcastable)
-
-        grad = T.grad(loss, param)
-        accu_new = rho * accu + (1 - rho) * grad ** 2
-        updates[accu] = accu_new
-        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
-    return updates
-
-
-def relu(x):
-    return x * (x > 0)
-
-
-def feed_layer(activation, weights, bias, input_):
-    return activation(T.dot(input_, weights) + bias)
-
-
-def init_weights(n_in, n_out):
-    rng = numpy.random.RandomState(1235)
-    
-    weights = numpy.asarray(
-        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
-        dtype=theano.config.floatX
-    )
-    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
-    return [wrapper(weights, name='W'), wrapper(bias, name='b')]
-
-
-def compile_model(n_classes, n_hidden, n_in, optimizer):
-    x = T.vector('x') 
-    costs = T.ivector('costs')
-    loss = T.scalar('loss')
-
-    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
-    hidden_W, hidden_b = init_weights(n_in, n_hidden)
-
-    # Feed the inputs forward through the network
-    p_y_given_x = feed_layer(
-                    T.nnet.softmax,
-                    maxent_W,
-                    maxent_b,
-                      feed_layer(
-                        relu,
-                        hidden_W,
-                        hidden_b,
-                        x))
-
-    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)
-
-    train_model = theano.function(
-        name='train_model',
-        inputs=[x, costs],
-        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
-        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
-        on_unused_input='warn'
-    )
-
-    evaluate_model = theano.function(
-        name='evaluate_model',
-        inputs=[x],
-        outputs=[
-            feed_layer(
-              T.nnet.softmax,
-              maxent_W,
-              maxent_b,
-              feed_layer(
-                relu,
-                hidden_W,
-                hidden_b,
-                x
-              )
-            )[0]
-        ]
-    )
-    return train_model, evaluate_model
-
-
-def score_model(scorer, nlp, annot_tuples, verbose=False):
-    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    nlp.tagger(tokens)
-    nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=verbose)
-
-
-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
-          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
-          seed=0, n_sents=0,  verbose=False):
-
-    dep_model_dir = path.join(model_dir, 'deps')
-    pos_model_dir = path.join(model_dir, 'pos')
-    if path.exists(dep_model_dir):
-        shutil.rmtree(dep_model_dir)
-    if path.exists(pos_model_dir):
-        shutil.rmtree(pos_model_dir)
-    os.mkdir(dep_model_dir)
-    os.mkdir(pos_model_dir)
-    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
-
-    Config.write(dep_model_dir, 'config',
-        seed=seed,
-        templates=tuple(),
-        labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
-        vector_lengths=(nv_word, nv_tag, nv_label),
-        hidden_nodes=nv_hidden,
-        eta=eta,
-        mu=mu
-    )
-  
-    # Bake-in hyper-parameters
-    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
-    nlp = Language(data_dir=model_dir)
-    n_classes = nlp.parser.model.n_classes
-    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
-    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
-                                   predict, model_loc)
- 
-    if n_sents > 0:
-        gold_tuples = gold_tuples[:n_sents]
-    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
-    log_loc = path.join(model_dir, 'job.log')
-    for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
-        for _, sents in gold_tuples:
-            for annot_tuples, ctnt in sents:
-                if len(annot_tuples[1]) == 1:
-                    continue
-                score_model(scorer, nlp, annot_tuples)
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                assert gold.is_projective
-                loss += nlp.parser.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
-                                                 scorer.tags_acc,
-                                                 scorer.token_acc)
-        print logline
-        with open(log_loc, 'aw') as file_:
-            file_.write(logline + '\n')
-    nlp.parser.model.end_training()
-    nlp.tagger.model.end_training()
-    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
-    return nlp
-
-
-def evaluate(nlp, gold_tuples, gold_preproc=True):
-    scorer = Scorer()
-    for raw_text, sents in gold_tuples:
-        for annot_tuples, brackets in sents:
-            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-            nlp.tagger(tokens)
-            nlp.parser(tokens)
-            gold = GoldParse(tokens, annot_tuples)
-            scorer.score(tokens, gold)
-    return scorer
-
-
-@plac.annotations(
-    train_loc=("Location of training file or directory"),
-    dev_loc=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    n_sents=("Number of training sentences", "option", "n", int),
-    n_iter=("Number of training iterations", "option", "i", int),
-    verbose=("Verbose error reporting", "flag", "v", bool),
-
-    nv_word=("Word vector length", "option", "W", int),
-    nv_tag=("Tag vector length", "option", "T", int),
-    nv_label=("Label vector length", "option", "L", int),
-    nv_hidden=("Hidden nodes length", "option", "H", int),
-    eta=("Learning rate", "option", "E", float),
-    mu=("Momentum", "option", "M", float),
-)
-def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
-         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
-         eta=0.1, mu=0.9, eval_only=False):
-
-
-
-
-    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))
-
-    nlp = train(English, gold_train, model_dir,
-               feat_set='embed',
-               eta=eta, mu=mu,
-               nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
-               n_sents=n_sents, n_iter=n_iter,
-               verbose=verbose)
-
-    scorer = evaluate(nlp, list(read_json_file(dev_loc)))
-    
-    print 'TOK', 100-scorer.token_acc
-    print 'POS', scorer.tags_acc
-    print 'UAS', scorer.uas
-    print 'LAS', scorer.las
-
-    print 'NER P', scorer.ents_p
-    print 'NER R', scorer.ents_r
-    print 'NER F', scorer.ents_f
-
-
-if __name__ == '__main__':
-    plac.call(main)

From d137b26c3290dc7adf3a7044af3388f54de063b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?=
 <raphael0202@users.noreply.github.com>
Date: Thu, 23 Mar 2017 21:28:57 +0100
Subject: [PATCH 03/15] Fix doc typo

---
 website/docs/usage/training.jade | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade
index 6963730ab..6a71ba68a 100644
--- a/website/docs/usage/training.jade
+++ b/website/docs/usage/training.jade
@@ -79,7 +79,7 @@ p
     |  conjunction features out of the atomic predictors. Let's say you have
     |  two atomic predictors asking, "What is the part-of-speech of the
     |  previous token?", and "What is the part-of-speech of the previous
-    |  previous token?". These ppredictors will introduce a number of features,
+    |  previous token?". These predictors will introduce a number of features,
     |  e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
     |  template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
 

From f40fbc3710edc2a19199cc3d01403f129dad6965 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 23 Mar 2017 23:38:57 +0100
Subject: [PATCH 04/15] Add test for Issue #910: Resuming entity training

---
 spacy/tests/regression/test_issue910.py | 113 ++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue910.py

diff --git a/spacy/tests/regression/test_issue910.py b/spacy/tests/regression/test_issue910.py
new file mode 100644
index 000000000..9b2b2287b
--- /dev/null
+++ b/spacy/tests/regression/test_issue910.py
@@ -0,0 +1,113 @@
+from __future__ import unicode_literals
+import json
+import os
+import random
+import contextlib
+import shutil
+import pytest
+import tempfile
+from pathlib import Path
+
+
+import pathlib
+from ...gold import GoldParse
+from ...pipeline import EntityRecognizer
+from ...en import English
+
+try:
+    unicode
+except NameError:
+    unicode = str
+
+
+@pytest.fixture
+def train_data():
+    return [
+            ["hey",[]],
+            ["howdy",[]],
+            ["hey there",[]],
+            ["hello",[]],
+            ["hi",[]],
+            ["i'm looking for a place to eat",[]],
+            ["i'm looking for a place in the north of town",[[31,36,"location"]]],
+            ["show me chinese restaurants",[[8,15,"cuisine"]]],
+            ["show me chines restaurants",[[8,14,"cuisine"]]],
+            ["yes",[]],
+            ["yep",[]],
+            ["yeah",[]],
+            ["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]],
+            ["bye",[]],["goodbye",[]],
+            ["good bye",[]],
+            ["stop",[]],
+            ["end",[]],
+            ["i am looking for an indian spot",[[20,26,"cuisine"]]],
+            ["search for restaurants",[]],
+            ["anywhere in the west",[[16,20,"location"]]],
+            ["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]],
+            ["indeed",[]],
+            ["that's right",[]],
+            ["ok",[]],
+            ["great",[]]
+    ]
+
+@pytest.fixture
+def additional_entity_types():
+    return ['cuisine', 'location']
+
+
+@contextlib.contextmanager
+def temp_save_model(model):
+    model_dir = Path(tempfile.mkdtemp())
+    # store the fine tuned model
+    with (model_dir / "config.json").open('w') as file_:
+        data = json.dumps(model.cfg)
+        if not isinstance(data, unicode):
+            data = data.decode('utf8')
+        file_.write(data)
+    model.model.dump((model_dir / 'model').as_posix())
+    yield model_dir
+    shutil.rmtree(model_dir.as_posix())
+
+
+
+@pytest.mark.xfail
+@pytest.mark.models
+def test_issue910(train_data, additional_entity_types):
+    '''Test that adding entities and resuming training works passably OK.
+    There are two issues here:
+
+    1) We have to readd labels. This isn't very nice.
+    2) There's no way to set the learning rate for the weight update, so we
+        end up out-of-scale, causing it to learn too fast.
+    '''
+    nlp = English()
+    doc = nlp(u"I am looking for a restaurant in Berlin")
+    ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
+    # Fine tune the ner model
+    for entity_type in additional_entity_types:
+        if entity_type not in nlp.entity.cfg['actions']['1']:
+            nlp.entity.add_label(entity_type)
+
+    nlp.entity.learn_rate = 0.001
+    for itn in range(4):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            nlp.tagger(doc)
+            gold = GoldParse(doc, entities=entity_offsets)
+            loss = nlp.entity.update(doc, gold)
+
+    with temp_save_model(nlp.entity) as model_dir:
+        # Load the fine tuned model
+        loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)
+
+    for entity_type in additional_entity_types:
+        if entity_type not in loaded_ner.cfg['actions']['1']:
+            loaded_ner.add_label(entity_type)
+
+    doc = nlp(u"I am looking for a restaurant in Berlin", entity=False)
+    nlp.tagger(doc)
+    loaded_ner(doc)
+
+    ents_after_train = [(ent.label_, ent.text) for ent in doc.ents]
+    assert ents_before_train == ents_after_train

From da135bd823dcd57520c2032ccf536adb35e9c087 Mon Sep 17 00:00:00 2001
From: Iddo Berger <iddoberger@gmail.com>
Date: Fri, 24 Mar 2017 18:27:44 +0300
Subject: [PATCH 05/15] add hebrew tokenizer

---
 spacy/he/__init__.py             |  18 +++
 spacy/he/language_data.py        |  17 +++
 spacy/he/stop_words.py           | 226 +++++++++++++++++++++++++++++++
 spacy/tests/conftest.py          |   6 +
 spacy/tests/he/__init__.py       |   0
 spacy/tests/he/test_tokenizer.py |  17 +++
 6 files changed, 284 insertions(+)
 create mode 100644 spacy/he/__init__.py
 create mode 100644 spacy/he/language_data.py
 create mode 100644 spacy/he/stop_words.py
 create mode 100644 spacy/tests/he/__init__.py
 create mode 100644 spacy/tests/he/test_tokenizer.py

diff --git a/spacy/he/__init__.py b/spacy/he/__init__.py
new file mode 100644
index 000000000..a3e86ed73
--- /dev/null
+++ b/spacy/he/__init__.py
@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from ..language import Language
+from ..attrs import LANG
+
+from .language_data import *
+
+
+class Hebrew(Language):
+    lang = 'he'
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'he'
+
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        stop_words = STOP_WORDS
diff --git a/spacy/he/language_data.py b/spacy/he/language_data.py
new file mode 100644
index 000000000..a4a657c33
--- /dev/null
+++ b/spacy/he/language_data.py
@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from .. import language_data as base
+from ..language_data import update_exc, strings_to_exc
+
+from .stop_words import STOP_WORDS
+
+
+STOP_WORDS = set(STOP_WORDS)
+
+
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/he/stop_words.py b/spacy/he/stop_words.py
new file mode 100644
index 000000000..2914fa0d5
--- /dev/null
+++ b/spacy/he/stop_words.py
@@ -0,0 +1,226 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+STOP_WORDS = set("""
+אני
+את
+אתה
+אנחנו
+אתן
+אתם
+הם
+הן
+היא
+הוא
+שלי
+שלו
+שלך
+שלה
+שלנו
+שלכם
+שלכן
+שלהם
+שלהן
+לי
+לו
+לה
+לנו
+לכם
+לכן
+להם
+להן
+אותה
+אותו
+זה
+זאת
+אלה
+אלו
+תחת
+מתחת
+מעל
+בין
+עם
+עד
+נגר
+על
+אל
+מול
+של
+אצל
+כמו
+אחר
+אותו
+בלי
+לפני
+אחרי
+מאחורי
+עלי
+עליו
+עליה
+עליך
+עלינו
+עליכם
+לעיכן
+עליהם
+עליהן
+כל
+כולם
+כולן
+כך
+ככה
+כזה
+זה
+זות
+אותי
+אותה
+אותם
+אותך
+אותו
+אותן
+אותנו
+ואת
+את
+אתכם
+אתכן
+איתי
+איתו
+איתך
+איתה
+איתם
+איתן
+איתנו
+איתכם
+איתכן
+יהיה
+תהיה
+היתי
+היתה
+היה
+להיות
+עצמי
+עצמו
+עצמה
+עצמם
+עצמן
+עצמנו
+עצמהם
+עצמהן
+מי
+מה
+איפה
+היכן
+במקום שבו
+אם
+לאן
+למקום שבו
+מקום בו
+איזה
+מהיכן
+איך
+כיצד
+באיזו מידה
+מתי
+בשעה ש
+כאשר
+כש
+למרות
+לפני
+אחרי
+מאיזו סיבה
+הסיבה שבגללה
+למה
+מדוע
+לאיזו תכלית
+כי
+יש
+אין
+אך
+מנין
+מאין
+מאיפה
+יכל
+יכלה
+יכלו
+יכול
+יכולה
+יכולים
+יכולות
+יוכלו
+יוכל
+מסוגל
+לא
+רק
+אולי
+אין
+לאו
+אי
+כלל
+נגד
+אם
+עם
+אל
+אלה
+אלו
+אף
+על
+מעל
+מתחת
+מצד
+בשביל
+לבין
+באמצע
+בתוך
+דרך
+מבעד
+באמצעות
+למעלה
+למטה
+מחוץ
+מן
+לעבר
+מכאן
+כאן
+הנה
+הרי
+פה
+שם
+אך
+ברם
+שוב
+אבל
+מבלי
+בלי
+מלבד
+רק
+בגלל
+מכיוון
+עד
+אשר
+ואילו
+למרות
+אס
+כמו
+כפי
+אז
+אחרי
+כן
+לכן
+לפיכך
+מאד
+עז
+מעט
+מעטים
+במידה
+שוב
+יותר
+מדי
+גם
+כן
+נו
+אחר
+אחרת
+אחרים
+אחרות
+אשר
+או
+""".split())
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index b6dcb905a..f049d2f91 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -11,6 +11,7 @@ from ..nl import Dutch
 from ..sv import Swedish
 from ..hu import Hungarian
 from ..fi import Finnish
+from ..he import Hebrew
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -73,6 +74,11 @@ def sv_tokenizer():
     return Swedish.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def he_tokenizer():
+    return Hebrew.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
diff --git a/spacy/tests/he/__init__.py b/spacy/tests/he/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/he/test_tokenizer.py b/spacy/tests/he/test_tokenizer.py
new file mode 100644
index 000000000..a6c65805a
--- /dev/null
+++ b/spacy/tests/he/test_tokenizer.py
@@ -0,0 +1,17 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+ABBREVIATION_TESTS = [
+    ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
+    tokens = he_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
\ No newline at end of file

From 8a0ee5fc81a0a3110557550801ae5db9c6c5fe6f Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sat, 25 Mar 2017 10:18:38 +0100
Subject: [PATCH 06/15] Fix download commands

---
 website/docs/api/language-models.jade | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade
index aad86f614..d93e617dd 100644
--- a/website/docs/api/language-models.jade
+++ b/website/docs/api/language-models.jade
@@ -5,8 +5,8 @@ include ../../_includes/_mixins
 p spaCy currently supports the following languages and capabilities:
 
 +aside-code("Download language models", "bash").
-    python -m spacy.en.download all
-    python -m spacy.de.download all
+    python -m spacy download en
+    python -m spacy download de
 
 +table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"])
     +row

From b7f714b4988a267bed384badc79fd8272e5fb55c Mon Sep 17 00:00:00 2001
From: Greg Baker <gregb@ifost.org.au>
Date: Sat, 25 Mar 2017 21:36:38 +1100
Subject: [PATCH 07/15] Possible solution to #909

---
 spacy/cli/link.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 0f28187b3..833b9aff2 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -5,6 +5,7 @@ import pip
 from pathlib import Path
 import importlib
 from .. import util
+import sys
 
 
 def link(origin, link_name, force=False):
@@ -43,7 +44,13 @@ def symlink(model_path, link_name, force):
     elif link_path.exists():
         link_path.unlink()
 
-    link_path.symlink_to(model_path)
+    if sys.version.startswith('2.') and sys.platform.startswith('win'):
+        import subprocess
+        subprocess.call(['mklink','/d',str(link_path), str(model_path)], 
+                        shell=True)
+    else:
+        link_path.symlink_to(model_path)
+
     util.print_msg(
         "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
         "You can now load the model via spacy.load('{l}').".format(l=link_name),

From fdec758113e3e0782241d7d1a94d4a0c8a611215 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sat, 25 Mar 2017 14:04:02 +0100
Subject: [PATCH 08/15] Add is_windows and is_python2 utility functions

---
 spacy/util.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/spacy/util.py b/spacy/util.py
index 893ba87c1..6c25ce0e8 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -153,6 +153,16 @@ def check_renamed_kwargs(renamed, kwargs):
             raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
 
 
+def is_windows():
+    """Check if user is on Windows."""
+    return sys.platform.startswith('win')
+
+
+def is_python2():
+    """Check if Python 2 is used."""
+    return sys.version.startswith('2.')
+
+
 def parse_package_meta(package_path, package, require=True):
     location = os.path.join(str(package_path), package, 'meta.json')
     if os.path.isfile(location):

From 97814f8da6b5d7e5da12a8d0ac62ce441730e479 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sat, 25 Mar 2017 14:04:27 +0100
Subject: [PATCH 09/15] Update Windows Python 2 link workaround to use helper
 functions

---
 spacy/cli/link.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 833b9aff2..3777fd85f 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -5,7 +5,6 @@ import pip
 from pathlib import Path
 import importlib
 from .. import util
-import sys
 
 
 def link(origin, link_name, force=False):
@@ -44,10 +43,11 @@ def symlink(model_path, link_name, force):
     elif link_path.exists():
         link_path.unlink()
 
-    if sys.version.startswith('2.') and sys.platform.startswith('win'):
+    # Add workaround for Python 2 on Windows (see issue #909)
+    if util.is_python2() and util.is_windows():
         import subprocess
-        subprocess.call(['mklink','/d',str(link_path), str(model_path)], 
-                        shell=True)
+        command = ['mklink', '/d', link_path.as_posix(), model_path.as_posix()]
+        subprocess.call(command, shell=True)
     else:
         link_path.symlink_to(model_path)
 

From 4454c1b23fc3793257ef20174389337f48be596a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 25 Mar 2017 21:29:57 +0100
Subject: [PATCH 10/15] Block lemmatization of base-form adjectives

Fix the check that an adjective is already in its base form (as
opposed to a comparative or superlative), so that it is not lemmatized
further, e.g. "inner" must not become "inn". Closes #912.
---
 spacy/lemmatizer.py                     |  8 +++++++-
 spacy/tests/regression/test_issue912.py | 14 ++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/regression/test_issue912.py

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 78ff43039..434c49e91 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -7,6 +7,8 @@ import ujson as json
 from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 from .symbols import VerbForm_inf, VerbForm_none
+from .symbols import Number_sing
+from .symbols import Degree_pos
 
 
 class Lemmatizer(object):
@@ -42,6 +44,7 @@ class Lemmatizer(object):
     def is_base_form(self, univ_pos, morphology=None):
         '''Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.'''
+        print("Is base form?", univ_pos, morphology)
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
         true_morph_key = morphology.get('morph', 0)
@@ -49,7 +52,10 @@ class Lemmatizer(object):
             return True
         elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
             return True
-        elif true_morph_key in (VerbForm_inf, VerbForm_none):
+        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
+            return True
+        elif true_morph_key in \
+            (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
             return True
         else:
             return False
diff --git a/spacy/tests/regression/test_issue912.py b/spacy/tests/regression/test_issue912.py
new file mode 100644
index 000000000..791e2e152
--- /dev/null
+++ b/spacy/tests/regression/test_issue912.py
@@ -0,0 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from ...tokens import Doc
+
+
+@pytest.mark.parametrize('text,tag,lemma', [("inner", "JJ", "inner")])
+def test_issue912(en_vocab, text, tag, lemma):
+    '''Test base-forms of adjectives are preserved.'''
+    doc = Doc(en_vocab, words=[text])
+    doc[0].tag_ = tag
+    assert doc[0].lemma_ == lemma
+

From 850d35dcb31f040e95d5329a4b6ef6c30ab6536e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 25 Mar 2017 21:49:10 +0100
Subject: [PATCH 11/15] Make morphology use int attributes internally

The morphology class was calling the lemmatizer inconsistently,
with some string-valued attributes. This caused Issue #903.
---
 spacy/morphology.pyx | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index e98ef1e92..5b22dd28e 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -32,12 +32,11 @@ def _normalize_props(props):
     return out
 
 
-
 cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer):
         self.mem = Pool()
         self.strings = string_store
-        self.tag_map = tag_map
+        self.tag_map = {}
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map) + 1
         self.tag_names = tuple(sorted(tag_map.keys()))
@@ -52,6 +51,7 @@ cdef class Morphology:
             self.rich_tags[i].morph = 0
             self.rich_tags[i].pos = attrs[POS]
             self.reverse_index[self.rich_tags[i].name] = i
+            self.tag_map[tag_str] = attrs
         self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
@@ -74,10 +74,10 @@ cdef class Morphology:
         # Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
             tag_id = self.reverse_index[self.strings['SP']]
+        rich_tag = self.rich_tags[tag_id]
         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            analysis.tag = self.rich_tags[tag_id]
             tag_str = self.strings[self.rich_tags[tag_id].name]
             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
                                             self.tag_map.get(tag_str, {}))
@@ -126,8 +126,7 @@ cdef class Morphology:
             else:
                 self.assign_feature(&cached.tag.morph, name_id, value_id)
         if cached.lemma == 0:
-            cached.lemma = self.lemmatize(rich_tag.pos, orth,
-                                          self.tag_map.get(tag_str, {}))
+            cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
         self._cache.set(tag_id, orth, <void*>cached)
 
     def load_morph_exceptions(self, dict exc):

From 4f400fa486ebf4fa7ef5aa90607cca68acb301a8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 25 Mar 2017 21:51:12 +0100
Subject: [PATCH 12/15] Prevent lemmatization of base nouns

Update lemmatizer's base-form check, for change in morphology class.
Closes #903.
---
 spacy/lemmatizer.py                     |  8 +++-----
 spacy/tests/regression/test_issue903.py | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 5 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue903.py

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 434c49e91..d10b40d7b 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -44,18 +44,16 @@ class Lemmatizer(object):
     def is_base_form(self, univ_pos, morphology=None):
         '''Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.'''
-        print("Is base form?", univ_pos, morphology)
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
         true_morph_key = morphology.get('morph', 0)
-        if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
+        if univ_pos == 'noun' and morphology.get('Number') == 'sing' and not others:
             return True
-        elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
+        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf' and not others:
             return True
         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
             return True
-        elif true_morph_key in \
-            (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
+        elif true_morph_key in (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
             return True
         else:
             return False
diff --git a/spacy/tests/regression/test_issue903.py b/spacy/tests/regression/test_issue903.py
new file mode 100644
index 000000000..36acd2dfc
--- /dev/null
+++ b/spacy/tests/regression/test_issue903.py
@@ -0,0 +1,16 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from ...tokens import Doc
+
+
+@pytest.mark.parametrize('text,tag,lemma',
+    [("anus", "NN", "anus"),
+     ("princess", "NN", "princess")])
+def test_issue912(en_vocab, text, tag, lemma):
+    '''Test base-forms of singular nouns (tag NN) are preserved, re Issue #903.'''
+    doc = Doc(en_vocab, words=[text])
+    doc[0].tag_ = tag
+    assert doc[0].lemma_ == lemma
+

From c748907a667c1ba0f9b2576edaf2a92951c197cb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 25 Mar 2017 21:56:41 +0100
Subject: [PATCH 13/15] Fix errors in previous commit

---
 spacy/lemmatizer.py  | 13 ++++++++++---
 spacy/morphology.pyx |  5 ++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index d10b40d7b..5cd4842a9 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -47,13 +47,20 @@ class Lemmatizer(object):
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
         true_morph_key = morphology.get('morph', 0)
-        if univ_pos == 'noun' and morphology.get('Number') == 'sing' and not others:
+        print(univ_pos, morphology)
+        if univ_pos == 'noun' and morphology.get('Number') == 'sing':
             return True
-        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf' and not others:
+        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
             return True
         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
             return True
-        elif true_morph_key in (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
+        elif VerbForm_inf in morphology:
+            return True
+        elif VerbForm_none in morphology:
+            return True
+        elif Number_sing in morphology:
+            return True
+        elif Degree_pos in morphology:
             return True
         else:
             return False
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 5b22dd28e..372bbb5ce 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -25,6 +25,8 @@ def _normalize_props(props):
             if value in POS_IDS:
                 value = POS_IDS[value]
             out[key] = value
+        elif isinstance(key, int):
+            out[key] = value
         elif key.lower() == 'pos':
             out[POS] = POS_IDS[value.upper()]
         else:
@@ -45,13 +47,13 @@ cdef class Morphology:
         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
+            self.tag_map[tag_str] = dict(attrs)
             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
             self.rich_tags[i].id = i
             self.rich_tags[i].name = self.strings[tag_str]
             self.rich_tags[i].morph = 0
             self.rich_tags[i].pos = attrs[POS]
             self.reverse_index[self.rich_tags[i].name] = i
-            self.tag_map[tag_str] = attrs
         self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
@@ -79,6 +81,7 @@ cdef class Morphology:
         if analysis is NULL:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
             tag_str = self.strings[self.rich_tags[tag_id].name]
+            analysis.tag = rich_tag
             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
                                             self.tag_map.get(tag_str, {}))
             self._cache.set(tag_id, token.lex.orth, analysis)

From b94286de30fdd154cfcd1c88889819c047693f7a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 25 Mar 2017 22:35:07 +0100
Subject: [PATCH 14/15] Fix regression test

---
 spacy/tests/regression/test_issue595.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue595.py b/spacy/tests/regression/test_issue595.py
index 6c73a621a..4a83a4020 100644
--- a/spacy/tests/regression/test_issue595.py
+++ b/spacy/tests/regression/test_issue595.py
@@ -12,7 +12,7 @@ import pytest
 def test_issue595():
     """Test lemmatization of base forms"""
     words = ["Do", "n't", "feed", "the", "dog"]
-    tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}
+    tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
     rules = {"verb": [["ed", "e"]]}
 
     lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)

From 2f63806ddb336800f26c04e29a7ecdc4d12be13f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 25 Mar 2017 22:35:44 +0100
Subject: [PATCH 15/15] Update config when adding label. Re #910

---
 spacy/pipeline.pyx | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index b2d622329..ea8221cff 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -19,6 +19,9 @@ cdef class EntityRecognizer(Parser):
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
+            if 'actions' in self.cfg:
+                self.cfg['actions'].setdefault(action,
+                                        {}).setdefault(label, True)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:
@@ -37,6 +40,9 @@ cdef class BeamEntityRecognizer(BeamParser):
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
+            if 'actions' in self.cfg:
+                self.cfg['actions'].setdefault(action,
+                                        {}).setdefault(label, True)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:
@@ -54,6 +60,9 @@ cdef class DependencyParser(Parser):
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
+            if 'actions' in self.cfg:
+                self.cfg['actions'].setdefault(action,
+                                        {}).setdefault(label, True)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:
@@ -71,6 +80,9 @@ cdef class BeamDependencyParser(BeamParser):
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
+            if 'actions' in self.cfg:
+                self.cfg['actions'].setdefault(action,
+                                        {}).setdefault(label, True)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs: