spaCy (mirror of https://github.com/explosion/spaCy.git)

Merge branch 'master' into develop
Commit 5eac089fbe
Deleted file (130 lines):

@@ -1,130 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random
import time
import gzip

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config


def is_punct_label(label):
    return label == 'P' or label.lower() == 'punct'


def read_gold(file_):
    """Read a standard CoNLL/MALT-style format"""
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            id_, word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx == -1:
                head_idx = i
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        sents.append((text, [words], ids, words, tags, heads, labels))
    return sents


def _parse_line(line):
    pieces = line.split()
    id_ = int(pieces[0])
    word = pieces[1]
    pos = pieces[3]
    head_idx = int(pieces[6])
    label = pieces[7]
    return id_, word, pos, head_idx, label


def iter_data(paragraphs, tokenizer, gold_preproc=False):
    for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
        assert len(words) == len(heads)
        for words in tokenized:
            sent_ids = ids[:len(words)]
            sent_tags = tags[:len(words)]
            sent_heads = heads[:len(words)]
            sent_labels = labels[:len(words)]
            sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
            tokens = tokenizer.tokens_from_list(words)
            yield tokens, sent_tags, sent_heads, sent_labels
            ids = ids[len(words):]
            tags = tags[len(words):]
            heads = heads[len(words):]
            labels = labels[len(words):]


def _map_indices_to_tokens(ids, heads):
    mapped = []
    for head in heads:
        if head not in ids:
            mapped.append(None)
        else:
            mapped.append(ids.index(head))
    return mapped


def evaluate(Language, dev_loc, model_dir):
    global loss
    nlp = Language()
    n_corr = 0
    pos_corr = 0
    n_tokens = 0
    total = 0
    skipped = 0
    loss = 0
    with codecs.open(dev_loc, 'r', 'utf8') as file_:
        paragraphs = read_gold(file_)
    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
        assert len(tokens) == len(labels)
        nlp.tagger.tag_from_strings(tokens, tag_strs)
        nlp.parser(tokens)
        for i, token in enumerate(tokens):
            try:
                pos_corr += token.tag_ == tag_strs[i]
            except:
                print i, token.orth_, token.tag
                raise
            n_tokens += 1
            if heads[i] is None:
                skipped += 1
                continue
            if is_punct_label(labels[i]):
                continue
            n_corr += token.head.i == heads[i]
            total += 1
    print loss, skipped, (loss+skipped + total)
    print pos_corr / n_tokens
    return float(n_corr) / (total + loss)


def main(dev_loc, model_dir):
    print evaluate(English, dev_loc, model_dir)


if __name__ == '__main__':
    plac.call(main)
Deleted file (261 lines):

@@ -1,261 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats
import re

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer

from spacy.syntax.parser import Parser, get_templates
from spacy._theano import TheanoModel

import theano
import theano.tensor as T

from theano.printing import Print

import numpy
from collections import OrderedDict, defaultdict


theano.config.profile = False
theano.config.floatX = 'float32'
floatX = theano.config.floatX


def L1(L1_reg, *weights):
    return L1_reg * sum(abs(w).sum() for w in weights)


def L2(L2_reg, *weights):
    return L2_reg * sum((w ** 2).sum() for w in weights)


def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    updates = OrderedDict()
    for param in params:
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        grad = T.grad(loss, param)
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
    return updates


def relu(x):
    return x * (x > 0)


def feed_layer(activation, weights, bias, input_):
    return activation(T.dot(input_, weights) + bias)


def init_weights(n_in, n_out):
    rng = numpy.random.RandomState(1235)

    weights = numpy.asarray(
        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
        dtype=theano.config.floatX
    )
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return [wrapper(weights, name='W'), wrapper(bias, name='b')]


def compile_model(n_classes, n_hidden, n_in, optimizer):
    x = T.vector('x')
    costs = T.ivector('costs')
    loss = T.scalar('loss')

    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
    hidden_W, hidden_b = init_weights(n_in, n_hidden)

    # Feed the inputs forward through the network
    p_y_given_x = feed_layer(
        T.nnet.softmax,
        maxent_W,
        maxent_b,
        feed_layer(
            relu,
            hidden_W,
            hidden_b,
            x))

    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)

    train_model = theano.function(
        name='train_model',
        inputs=[x, costs],
        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
        on_unused_input='warn'
    )

    evaluate_model = theano.function(
        name='evaluate_model',
        inputs=[x],
        outputs=[
            feed_layer(
                T.nnet.softmax,
                maxent_W,
                maxent_b,
                feed_layer(
                    relu,
                    hidden_W,
                    hidden_b,
                    x
                )
            )[0]
        ]
    )
    return train_model, evaluate_model


def score_model(scorer, nlp, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold, verbose=verbose)


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
          seed=0, n_sents=0, verbose=False):

    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config',
                 seed=seed,
                 templates=tuple(),
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                 vector_lengths=(nv_word, nv_tag, nv_label),
                 hidden_nodes=nv_hidden,
                 eta=eta,
                 mu=mu
                 )

    # Bake-in hyper-parameters
    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
                                   predict, model_loc)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                 scorer.tags_acc,
                                                 scorer.token_acc)
        print logline
        with open(log_loc, 'aw') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp


def evaluate(nlp, gold_tuples, gold_preproc=True):
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        for annot_tuples, brackets in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(tokens)
            nlp.parser(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold)
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),

    nv_word=("Word vector length", "option", "W", int),
    nv_tag=("Tag vector length", "option", "T", int),
    nv_label=("Label vector length", "option", "L", int),
    nv_hidden=("Hidden nodes length", "option", "H", int),
    eta=("Learning rate", "option", "E", float),
    mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
         eta=0.1, mu=0.9, eval_only=False):

    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))

    nlp = train(English, gold_train, model_dir,
                feat_set='embed',
                eta=eta, mu=mu,
                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
                n_sents=n_sents, n_iter=n_iter,
                verbose=verbose)

    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    print 'TOK', 100-scorer.token_acc
    print 'POS', scorer.tags_acc
    print 'UAS', scorer.uas
    print 'LAS', scorer.las

    print 'NER P', scorer.ents_p
    print 'NER R', scorer.ents_r
    print 'NER F', scorer.ents_f


if __name__ == '__main__':
    plac.call(main)
@@ -56,7 +56,8 @@ def get_version(model, comp):
 def download_model(filename):
     util.print_msg("Downloading {f}".format(f=filename))
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m', 'pip', 'install', download_url],
+    subprocess.call([sys.executable, '-m',
+                     'pip', 'install', '--no-cache-dir', download_url],
                     env=os.environ.copy())
@@ -43,7 +43,14 @@ def symlink(model_path, link_name, force):
     elif link_path.exists():
         link_path.unlink()

-    link_path.symlink_to(model_path)
+    # Add workaround for Python 2 on Windows (see issue #909)
+    if util.is_python2() and util.is_windows():
+        import subprocess
+        command = ['mklink', '/d', link_path.as_posix(), model_path.as_posix()]
+        subprocess.call(command, shell=True)
+    else:
+        link_path.symlink_to(model_path)

     util.print_msg(
         "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
         "You can now load the model via spacy.load('{l}').".format(l=link_name),
New file: spacy/he/__init__.py (18 lines)

@@ -0,0 +1,18 @@
# encoding: utf8
from __future__ import unicode_literals, print_function

from ..language import Language
from ..attrs import LANG

from .language_data import *


class Hebrew(Language):
    lang = 'he'

    class Defaults(Language.Defaults):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'he'

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS
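A quick way to exercise the new Hebrew language class, mirroring the he_tokenizer fixture and the tokenizer test added later in this commit (sketch only; the sample sentence comes from that test, and the API shown is the one used by the diff itself):

# Sketch: build a Hebrew tokenizer the same way the new conftest.py fixture does.
from spacy.he import Hebrew

tokenizer = Hebrew.Defaults.create_tokenizer()
doc = tokenizer('פייתון היא שפת תכנות דינמית')
print([token.text for token in doc])  # expected: ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית']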
New file: spacy/he/language_data.py (17 lines)

@@ -0,0 +1,17 @@
# encoding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .stop_words import STOP_WORDS


STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
New file: spacy/he/stop_words.py (226 lines)

@@ -0,0 +1,226 @@
# encoding: utf8
from __future__ import unicode_literals

STOP_WORDS = set("""
אני
את
אתה
אנחנו
אתן
אתם
הם
הן
היא
הוא
שלי
שלו
שלך
שלה
שלנו
שלכם
שלכן
שלהם
שלהן
לי
לו
לה
לנו
לכם
לכן
להם
להן
אותה
אותו
זה
זאת
אלה
אלו
תחת
מתחת
מעל
בין
עם
עד
נגר
על
אל
מול
של
אצל
כמו
אחר
אותו
בלי
לפני
אחרי
מאחורי
עלי
עליו
עליה
עליך
עלינו
עליכם
לעיכן
עליהם
עליהן
כל
כולם
כולן
כך
ככה
כזה
זה
זות
אותי
אותה
אותם
אותך
אותו
אותן
אותנו
ואת
את
אתכם
אתכן
איתי
איתו
איתך
איתה
איתם
איתן
איתנו
איתכם
איתכן
יהיה
תהיה
היתי
היתה
היה
להיות
עצמי
עצמו
עצמה
עצמם
עצמן
עצמנו
עצמהם
עצמהן
מי
מה
איפה
היכן
במקום שבו
אם
לאן
למקום שבו
מקום בו
איזה
מהיכן
איך
כיצד
באיזו מידה
מתי
בשעה ש
כאשר
כש
למרות
לפני
אחרי
מאיזו סיבה
הסיבה שבגללה
למה
מדוע
לאיזו תכלית
כי
יש
אין
אך
מנין
מאין
מאיפה
יכל
יכלה
יכלו
יכול
יכולה
יכולים
יכולות
יוכלו
יוכל
מסוגל
לא
רק
אולי
אין
לאו
אי
כלל
נגד
אם
עם
אל
אלה
אלו
אף
על
מעל
מתחת
מצד
בשביל
לבין
באמצע
בתוך
דרך
מבעד
באמצעות
למעלה
למטה
מחוץ
מן
לעבר
מכאן
כאן
הנה
הרי
פה
שם
אך
ברם
שוב
אבל
מבלי
בלי
מלבד
רק
בגלל
מכיוון
עד
אשר
ואילו
למרות
אס
כמו
כפי
אז
אחרי
כן
לכן
לפיכך
מאד
עז
מעט
מעטים
במידה
שוב
יותר
מדי
גם
כן
נו
אחר
אחרת
אחרים
אחרות
אשר
או
""".split())
@@ -7,6 +7,8 @@ import ujson as json
 from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 from .symbols import VerbForm_inf, VerbForm_none
+from .symbols import Number_sing
+from .symbols import Degree_pos


 class Lemmatizer(object):
@@ -45,11 +47,20 @@ class Lemmatizer(object):
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
         true_morph_key = morphology.get('morph', 0)
-        if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
+        print(univ_pos, morphology)
+        if univ_pos == 'noun' and morphology.get('Number') == 'sing':
             return True
-        elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
+        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
             return True
-        elif true_morph_key in (VerbForm_inf, VerbForm_none):
+        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
+            return True
+        elif VerbForm_inf in morphology:
+            return True
+        elif VerbForm_none in morphology:
+            return True
+        elif Number_sing in morphology:
+            return True
+        elif Degree_pos in morphology:
             return True
         else:
             return False
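For context, the matching tag-map convention appears in the updated regression test further down this diff: morphological features are now attached as attribute constants mapped to True rather than stored under a 'morph' key, which is what the new `elif ... in morphology` branches above pick up. A minimal sketch using only symbols imported in this commit:

from spacy.symbols import POS, VERB, VerbForm_inf

# New-style tag map entry, as used in the updated test_issue595 below:
tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}

# Old-style entry that the removed code expected:
# tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}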
@@ -25,6 +25,8 @@ def _normalize_props(props):
             if value in POS_IDS:
                 value = POS_IDS[value]
             out[key] = value
+        elif isinstance(key, int):
+            out[key] = value
         elif key.lower() == 'pos':
             out[POS] = POS_IDS[value.upper()]
         else:
@@ -32,12 +34,11 @@ def _normalize_props(props):
     return out


-
 cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer):
         self.mem = Pool()
         self.strings = string_store
-        self.tag_map = tag_map
+        self.tag_map = {}
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map) + 1
         self.tag_names = tuple(sorted(tag_map.keys()))
@@ -46,6 +47,7 @@ cdef class Morphology:
         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
+            self.tag_map[tag_str] = dict(attrs)
             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
             self.rich_tags[i].id = i
             self.rich_tags[i].name = self.strings[tag_str]
@@ -74,11 +76,12 @@ cdef class Morphology:
         # Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
             tag_id = self.reverse_index[self.strings['SP']]
+        rich_tag = self.rich_tags[tag_id]
         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            analysis.tag = self.rich_tags[tag_id]
             tag_str = self.strings[self.rich_tags[tag_id].name]
+            analysis.tag = rich_tag
             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
                                             self.tag_map.get(tag_str, {}))
             self._cache.set(tag_id, token.lex.orth, analysis)
@@ -126,8 +129,7 @@ cdef class Morphology:
         else:
             self.assign_feature(&cached.tag.morph, name_id, value_id)
         if cached.lemma == 0:
-            cached.lemma = self.lemmatize(rich_tag.pos, orth,
-                                          self.tag_map.get(tag_str, {}))
+            cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
         self._cache.set(tag_id, orth, <void*>cached)

     def load_morph_exceptions(self, dict exc):
@@ -19,6 +19,9 @@ cdef class EntityRecognizer(Parser):
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
+            if 'actions' in self.cfg:
+                self.cfg['actions'].setdefault(action,
+                    {}).setdefault(label, True)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:
@@ -37,6 +40,9 @@ cdef class BeamEntityRecognizer(BeamParser):
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
+            if 'actions' in self.cfg:
+                self.cfg['actions'].setdefault(action,
+                    {}).setdefault(label, True)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:
@@ -54,6 +60,9 @@ cdef class DependencyParser(Parser):
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
+            if 'actions' in self.cfg:
+                self.cfg['actions'].setdefault(action,
+                    {}).setdefault(label, True)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:
@@ -71,6 +80,9 @@ cdef class BeamDependencyParser(BeamParser):
     def add_label(self, label):
        for action in self.moves.action_types:
             self.moves.add_action(action, label)
+            if 'actions' in self.cfg:
+                self.cfg['actions'].setdefault(action,
+                    {}).setdefault(label, True)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:
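The new cfg['actions'] bookkeeping is what the issue #910 regression test added below relies on: extra labels given to a trained component are recorded in its config, so they can be checked and re-added after the model is saved and reloaded. The calling pattern, lifted from that test (it assumes an nlp pipeline with an entity recognizer is already loaded; '1' is the action type the test inspects):

additional_entity_types = ['cuisine', 'location']
for entity_type in additional_entity_types:
    # Only add the label if it is not already recorded for this action type.
    if entity_type not in nlp.entity.cfg['actions']['1']:
        nlp.entity.add_label(entity_type)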
@@ -12,6 +12,8 @@ from ..sv import Swedish
 from ..hu import Hungarian
 from ..fi import Finnish
 from ..bn import Bengali
+from ..he import Hebrew
+
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -78,6 +80,11 @@ def bn_tokenizer():
     return Bengali.Defaults.create_tokenizer()


+@pytest.fixture
+def he_tokenizer():
+    return Hebrew.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
New file: spacy/tests/he/__init__.py (empty)

New file: spacy/tests/he/test_tokenizer.py (17 lines)

@@ -0,0 +1,17 @@
# encoding: utf8
from __future__ import unicode_literals

import pytest

ABBREVIATION_TESTS = [
    ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
]

TESTCASES = ABBREVIATION_TESTS


@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
    tokens = he_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
@@ -12,7 +12,7 @@ import pytest
 def test_issue595():
     """Test lemmatization of base forms"""
     words = ["Do", "n't", "feed", "the", "dog"]
-    tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}
+    tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
     rules = {"verb": [["ed", "e"]]}

     lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
New file: spacy/tests/regression/test_issue903.py (16 lines)

@@ -0,0 +1,16 @@
# coding: utf8
from __future__ import unicode_literals

import pytest
from ...tokens import Doc


@pytest.mark.parametrize('text,tag,lemma',
    [("anus", "NN", "anus"),
     ("princess", "NN", "princess")])
def test_issue912(en_vocab, text, tag, lemma):
    '''Test base-forms of adjectives are preserved.'''
    doc = Doc(en_vocab, words=[text])
    doc[0].tag_ = tag
    assert doc[0].lemma_ == lemma
New file: spacy/tests/regression/test_issue910.py (113 lines)

@@ -0,0 +1,113 @@
from __future__ import unicode_literals
import json
import os
import random
import contextlib
import shutil
import pytest
import tempfile
from pathlib import Path


import pathlib
from ...gold import GoldParse
from ...pipeline import EntityRecognizer
from ...en import English

try:
    unicode
except NameError:
    unicode = str


@pytest.fixture
def train_data():
    return [
        ["hey",[]],
        ["howdy",[]],
        ["hey there",[]],
        ["hello",[]],
        ["hi",[]],
        ["i'm looking for a place to eat",[]],
        ["i'm looking for a place in the north of town",[[31,36,"location"]]],
        ["show me chinese restaurants",[[8,15,"cuisine"]]],
        ["show me chines restaurants",[[8,14,"cuisine"]]],
        ["yes",[]],
        ["yep",[]],
        ["yeah",[]],
        ["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]],
        ["bye",[]],["goodbye",[]],
        ["good bye",[]],
        ["stop",[]],
        ["end",[]],
        ["i am looking for an indian spot",[[20,26,"cuisine"]]],
        ["search for restaurants",[]],
        ["anywhere in the west",[[16,20,"location"]]],
        ["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]],
        ["indeed",[]],
        ["that's right",[]],
        ["ok",[]],
        ["great",[]]
    ]

@pytest.fixture
def additional_entity_types():
    return ['cuisine', 'location']


@contextlib.contextmanager
def temp_save_model(model):
    model_dir = Path(tempfile.mkdtemp())
    # store the fine tuned model
    with (model_dir / "config.json").open('w') as file_:
        data = json.dumps(model.cfg)
        if not isinstance(data, unicode):
            data = data.decode('utf8')
        file_.write(data)
    model.model.dump((model_dir / 'model').as_posix())
    yield model_dir
    shutil.rmtree(model_dir.as_posix())



@pytest.mark.xfail
@pytest.mark.models
def test_issue910(train_data, additional_entity_types):
    '''Test that adding entities and resuming training works passably OK.
    There are two issues here:

    1) We have to readd labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    '''
    nlp = English()
    doc = nlp(u"I am looking for a restaurant in Berlin")
    ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
    # Fine tune the ner model
    for entity_type in additional_entity_types:
        if entity_type not in nlp.entity.cfg['actions']['1']:
            nlp.entity.add_label(entity_type)

    nlp.entity.learn_rate = 0.001
    for itn in range(4):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            nlp.tagger(doc)
            gold = GoldParse(doc, entities=entity_offsets)
            loss = nlp.entity.update(doc, gold)

    with temp_save_model(nlp.entity) as model_dir:
        # Load the fine tuned model
        loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)

        for entity_type in additional_entity_types:
            if entity_type not in loaded_ner.cfg['actions']['1']:
                loaded_ner.add_label(entity_type)

        doc = nlp(u"I am looking for a restaurant in Berlin", entity=False)
        nlp.tagger(doc)
        loaded_ner(doc)

        ents_after_train = [(ent.label_, ent.text) for ent in doc.ents]
        assert ents_before_train == ents_after_train
New file: spacy/tests/regression/test_issue912.py (14 lines)

@@ -0,0 +1,14 @@
# coding: utf8
from __future__ import unicode_literals

import pytest
from ...tokens import Doc


@pytest.mark.parametrize('text,tag,lemma', [("inner", "JJ", "inner")])
def test_issue912(en_vocab, text, tag, lemma):
    '''Test base-forms of adjectives are preserved.'''
    doc = Doc(en_vocab, words=[text])
    doc[0].tag_ = tag
    assert doc[0].lemma_ == lemma
@@ -153,6 +153,16 @@ def check_renamed_kwargs(renamed, kwargs):
             raise TypeError("Keyword argument %s now renamed to %s" % (old, new))


+def is_windows():
+    """Check if user is on Windows."""
+    return sys.platform.startswith('win')
+
+
+def is_python2():
+    """Check if Python 2 is used."""
+    return sys.version.startswith('2.')
+
+
 def parse_package_meta(package_path, package, require=True):
     location = os.path.join(str(package_path), package, 'meta.json')
     if os.path.isfile(location):
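These helpers back the symlink workaround added to the link command earlier in this diff. A rough sketch of how they are meant to combine (the paths here are illustrative, not from the commit):

import subprocess
from spacy import util

model_path = 'C:/models/en_core_web_sm'   # illustrative path only
link_path = 'C:/spacy/data/en'            # illustrative path only

if util.is_python2() and util.is_windows():
    # Python 2 on Windows cannot create the symlink directly (issue #909),
    # so shell out to mklink instead.
    subprocess.call(['mklink', '/d', link_path, model_path], shell=True)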
@@ -82,7 +82,7 @@ p
 | conjunction features out of the atomic predictors. Let's say you have
 | two atomic predictors asking, "What is the part-of-speech of the
 | previous token?", and "What is the part-of-speech of the previous
-| previous token?". These ppredictors will introduce a number of features,
+| previous token?". These predictors will introduce a number of features,
 | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
 | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
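Roughly, a conjunction template concatenates the values produced by two atomic predictors into one joint feature, so the model can weight the combination separately from its parts. An illustrative sketch, not spaCy's internal feature machinery:

def atomic_features(prev_pos, prev_prev_pos):
    # Each atomic predictor contributes its own feature string.
    return ['Prev-pos=%s' % prev_pos, 'Prev-prev-pos=%s' % prev_prev_pos]

def conjunction_feature(prev_pos, prev_prev_pos):
    # The conjunction template fires only on the joint value of both predictors.
    return '%s&%s' % tuple(atomic_features(prev_pos, prev_prev_pos))

print(atomic_features('NN', 'VBZ'))      # ['Prev-pos=NN', 'Prev-prev-pos=VBZ']
print(conjunction_feature('NN', 'VBZ'))  # 'Prev-pos=NN&Prev-prev-pos=VBZ'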
|
|
Loading…
Reference in New Issue
Block a user