From 692eb0603d5305a19407302721d4cd6790235496 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 20 Mar 2017 18:24:44 +0100 Subject: [PATCH 01/15] Fix high memory usage in download command Due to PyPi issue #2984, installing large packages via pip causes a large spike in memory usage. The recommended fix is to disable caching. --- spacy/cli/download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 446de4a37..56dbd5264 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -56,7 +56,8 @@ def get_version(model, comp): def download_model(filename): util.print_msg("Downloading {f}".format(f=filename)) download_url = about.__download_url__ + '/' + filename - subprocess.call([sys.executable, '-m', 'pip', 'install', download_url], + subprocess.call([sys.executable, '-m', + 'pip', 'install', '--no-cache-dir', download_url], env=os.environ.copy()) From 8bc05c2ba97dd51fa9a066def0ab82a97ca55d11 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 23 Mar 2017 11:07:59 +0100 Subject: [PATCH 02/15] Delete old training scripts (resolves #911) --- bin/parser/conll_parse.py | 130 ------------------- bin/parser/nn_train.py | 261 -------------------------------------- 2 files changed, 391 deletions(-) delete mode 100644 bin/parser/conll_parse.py delete mode 100755 bin/parser/nn_train.py diff --git a/bin/parser/conll_parse.py b/bin/parser/conll_parse.py deleted file mode 100644 index 85a81c432..000000000 --- a/bin/parser/conll_parse.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals - -import os -from os import path -import shutil -import codecs -import random -import time -import gzip - -import plac -import cProfile -import pstats - -import spacy.util -from spacy.en import English -from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir - -from spacy.syntax.parser import GreedyParser -from spacy.syntax.parser import OracleError -from spacy.syntax.util import Config - - -def is_punct_label(label): - return label == 'P' or label.lower() == 'punct' - - -def read_gold(file_): - """Read a standard CoNLL/MALT-style format""" - sents = [] - for sent_str in file_.read().strip().split('\n\n'): - ids = [] - words = [] - heads = [] - labels = [] - tags = [] - for i, line in enumerate(sent_str.split('\n')): - id_, word, pos_string, head_idx, label = _parse_line(line) - words.append(word) - if head_idx == -1: - head_idx = i - ids.append(id_) - heads.append(head_idx) - labels.append(label) - tags.append(pos_string) - text = ' '.join(words) - sents.append((text, [words], ids, words, tags, heads, labels)) - return sents - - -def _parse_line(line): - pieces = line.split() - id_ = int(pieces[0]) - word = pieces[1] - pos = pieces[3] - head_idx = int(pieces[6]) - label = pieces[7] - return id_, word, pos, head_idx, label - - -def iter_data(paragraphs, tokenizer, gold_preproc=False): - for raw, tokenized, ids, words, tags, heads, labels in paragraphs: - assert len(words) == len(heads) - for words in tokenized: - sent_ids = ids[:len(words)] - sent_tags = tags[:len(words)] - sent_heads = heads[:len(words)] - sent_labels = labels[:len(words)] - sent_heads = _map_indices_to_tokens(sent_ids, sent_heads) - tokens = tokenizer.tokens_from_list(words) - yield tokens, sent_tags, sent_heads, sent_labels - ids = ids[len(words):] - tags = tags[len(words):] - heads = heads[len(words):] - labels = labels[len(words):] - - -def _map_indices_to_tokens(ids, heads): - mapped = [] - for head in heads: - if head not in ids: - mapped.append(None) - else: - mapped.append(ids.index(head)) - return mapped - - - -def evaluate(Language, dev_loc, model_dir): - global loss - nlp = Language() - n_corr = 0 - pos_corr = 0 - n_tokens = 0 - total = 0 - skipped = 0 - loss = 0 - with codecs.open(dev_loc, 'r', 'utf8') as file_: - paragraphs = read_gold(file_) - for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer): - assert len(tokens) == len(labels) - nlp.tagger.tag_from_strings(tokens, tag_strs) - nlp.parser(tokens) - for i, token in enumerate(tokens): - try: - pos_corr += token.tag_ == tag_strs[i] - except: - print i, token.orth_, token.tag - raise - n_tokens += 1 - if heads[i] is None: - skipped += 1 - continue - if is_punct_label(labels[i]): - continue - n_corr += token.head.i == heads[i] - total += 1 - print loss, skipped, (loss+skipped + total) - print pos_corr / n_tokens - return float(n_corr) / (total + loss) - - -def main(dev_loc, model_dir): - print evaluate(English, dev_loc, model_dir) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py deleted file mode 100755 index 72c9e04f1..000000000 --- a/bin/parser/nn_train.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals - -import os -from os import path -import shutil -import codecs -import random - -import plac -import cProfile -import pstats -import re - -import spacy.util -from spacy.en import English -from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir - -from spacy.syntax.util import Config -from spacy.gold import read_json_file -from spacy.gold import GoldParse - -from spacy.scorer import Scorer - -from spacy.syntax.parser import Parser, get_templates -from spacy._theano import TheanoModel - -import theano -import theano.tensor as T - -from theano.printing import Print - -import numpy -from collections import OrderedDict, defaultdict - - -theano.config.profile = False -theano.config.floatX = 'float32' -floatX = theano.config.floatX - - -def L1(L1_reg, *weights): - return L1_reg * sum(abs(w).sum() for w in weights) - - -def L2(L2_reg, *weights): - return L2_reg * sum((w ** 2).sum() for w in weights) - - -def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6): - updates = OrderedDict() - for param in params: - value = param.get_value(borrow=True) - accu = theano.shared(np.zeros(value.shape, dtype=value.dtype), - broadcastable=param.broadcastable) - - grad = T.grad(loss, param) - accu_new = rho * accu + (1 - rho) * grad ** 2 - updates[accu] = accu_new - updates[param] = param - (eta * grad / T.sqrt(accu_new + eps)) - return updates - - -def relu(x): - return x * (x > 0) - - -def feed_layer(activation, weights, bias, input_): - return activation(T.dot(input_, weights) + bias) - - -def init_weights(n_in, n_out): - rng = numpy.random.RandomState(1235) - - weights = numpy.asarray( - rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in), - dtype=theano.config.floatX - ) - bias = numpy.zeros((n_out,), dtype=theano.config.floatX) - return [wrapper(weights, name='W'), wrapper(bias, name='b')] - - -def compile_model(n_classes, n_hidden, n_in, optimizer): - x = T.vector('x') - costs = T.ivector('costs') - loss = T.scalar('loss') - - maxent_W, maxent_b = init_weights(n_hidden, n_classes) - hidden_W, hidden_b = init_weights(n_in, n_hidden) - - # Feed the inputs forward through the network - p_y_given_x = feed_layer( - T.nnet.softmax, - maxent_W, - maxent_b, - feed_layer( - relu, - hidden_W, - hidden_b, - x)) - - loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8) - - train_model = theano.function( - name='train_model', - inputs=[x, costs], - outputs=[p_y_given_x[0], T.grad(loss, x), loss], - updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]), - on_unused_input='warn' - ) - - evaluate_model = theano.function( - name='evaluate_model', - inputs=[x], - outputs=[ - feed_layer( - T.nnet.softmax, - maxent_W, - maxent_b, - feed_layer( - relu, - hidden_W, - hidden_b, - x - ) - )[0] - ] - ) - return train_model, evaluate_model - - -def score_model(scorer, nlp, annot_tuples, verbose=False): - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=verbose) - - -def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10, - seed=0, n_sents=0, verbose=False): - - dep_model_dir = path.join(model_dir, 'deps') - pos_model_dir = path.join(model_dir, 'pos') - if path.exists(dep_model_dir): - shutil.rmtree(dep_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) - os.mkdir(dep_model_dir) - os.mkdir(pos_model_dir) - setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - - Config.write(dep_model_dir, 'config', - seed=seed, - templates=tuple(), - labels=Language.ParserTransitionSystem.get_labels(gold_tuples), - vector_lengths=(nv_word, nv_tag, nv_label), - hidden_nodes=nv_hidden, - eta=eta, - mu=mu - ) - - # Bake-in hyper-parameters - optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps) - nlp = Language(data_dir=model_dir) - n_classes = nlp.parser.model.n_classes - train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer) - nlp.parser.model = TheanoModel(n_classes, input_spec, train, - predict, model_loc) - - if n_sents > 0: - gold_tuples = gold_tuples[:n_sents] - print "Itn.\tP.Loss\tUAS\tTag %\tToken %" - log_loc = path.join(model_dir, 'job.log') - for itn in range(n_iter): - scorer = Scorer() - loss = 0 - for _, sents in gold_tuples: - for annot_tuples, ctnt in sents: - if len(annot_tuples[1]) == 1: - continue - score_model(scorer, nlp, annot_tuples) - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=True) - assert gold.is_projective - loss += nlp.parser.train(tokens, gold) - nlp.tagger.train(tokens, gold.tags) - random.shuffle(gold_tuples) - logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, - scorer.tags_acc, - scorer.token_acc) - print logline - with open(log_loc, 'aw') as file_: - file_.write(logline + '\n') - nlp.parser.model.end_training() - nlp.tagger.model.end_training() - nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) - return nlp - - -def evaluate(nlp, gold_tuples, gold_preproc=True): - scorer = Scorer() - for raw_text, sents in gold_tuples: - for annot_tuples, brackets in sents: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold) - return scorer - - -@plac.annotations( - train_loc=("Location of training file or directory"), - dev_loc=("Location of development file or directory"), - model_dir=("Location of output model directory",), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - n_sents=("Number of training sentences", "option", "n", int), - n_iter=("Number of training iterations", "option", "i", int), - verbose=("Verbose error reporting", "flag", "v", bool), - - nv_word=("Word vector length", "option", "W", int), - nv_tag=("Tag vector length", "option", "T", int), - nv_label=("Label vector length", "option", "L", int), - nv_hidden=("Hidden nodes length", "option", "H", int), - eta=("Learning rate", "option", "E", float), - mu=("Momentum", "option", "M", float), -) -def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False, - nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, - eta=0.1, mu=0.9, eval_only=False): - - - - - gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])) - - nlp = train(English, gold_train, model_dir, - feat_set='embed', - eta=eta, mu=mu, - nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, - n_sents=n_sents, n_iter=n_iter, - verbose=verbose) - - scorer = evaluate(nlp, list(read_json_file(dev_loc))) - - print 'TOK', 100-scorer.token_acc - print 'POS', scorer.tags_acc - print 'UAS', scorer.uas - print 'LAS', scorer.las - - print 'NER P', scorer.ents_p - print 'NER R', scorer.ents_r - print 'NER F', scorer.ents_f - - -if __name__ == '__main__': - plac.call(main) From d137b26c3290dc7adf3a7044af3388f54de063b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 23 Mar 2017 21:28:57 +0100 Subject: [PATCH 03/15] Fix doc typo --- website/docs/usage/training.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 6963730ab..6a71ba68a 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -79,7 +79,7 @@ p | conjunction features out of the atomic predictors. Let's say you have | two atomic predictors asking, "What is the part-of-speech of the | previous token?", and "What is the part-of-speech of the previous - | previous token?". These ppredictors will introduce a number of features, + | previous token?". These predictors will introduce a number of features, | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ]. From f40fbc3710edc2a19199cc3d01403f129dad6965 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 23 Mar 2017 23:38:57 +0100 Subject: [PATCH 04/15] Add test for Issue #910: Resuming entity training --- spacy/tests/regression/test_issue910.py | 113 ++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 spacy/tests/regression/test_issue910.py diff --git a/spacy/tests/regression/test_issue910.py b/spacy/tests/regression/test_issue910.py new file mode 100644 index 000000000..9b2b2287b --- /dev/null +++ b/spacy/tests/regression/test_issue910.py @@ -0,0 +1,113 @@ +from __future__ import unicode_literals +import json +import os +import random +import contextlib +import shutil +import pytest +import tempfile +from pathlib import Path + + +import pathlib +from ...gold import GoldParse +from ...pipeline import EntityRecognizer +from ...en import English + +try: + unicode +except NameError: + unicode = str + + +@pytest.fixture +def train_data(): + return [ + ["hey",[]], + ["howdy",[]], + ["hey there",[]], + ["hello",[]], + ["hi",[]], + ["i'm looking for a place to eat",[]], + ["i'm looking for a place in the north of town",[[31,36,"location"]]], + ["show me chinese restaurants",[[8,15,"cuisine"]]], + ["show me chines restaurants",[[8,14,"cuisine"]]], + ["yes",[]], + ["yep",[]], + ["yeah",[]], + ["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]], + ["bye",[]],["goodbye",[]], + ["good bye",[]], + ["stop",[]], + ["end",[]], + ["i am looking for an indian spot",[[20,26,"cuisine"]]], + ["search for restaurants",[]], + ["anywhere in the west",[[16,20,"location"]]], + ["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]], + ["indeed",[]], + ["that's right",[]], + ["ok",[]], + ["great",[]] + ] + +@pytest.fixture +def additional_entity_types(): + return ['cuisine', 'location'] + + +@contextlib.contextmanager +def temp_save_model(model): + model_dir = Path(tempfile.mkdtemp()) + # store the fine tuned model + with (model_dir / "config.json").open('w') as file_: + data = json.dumps(model.cfg) + if not isinstance(data, unicode): + data = data.decode('utf8') + file_.write(data) + model.model.dump((model_dir / 'model').as_posix()) + yield model_dir + shutil.rmtree(model_dir.as_posix()) + + + +@pytest.mark.xfail +@pytest.mark.models +def test_issue910(train_data, additional_entity_types): + '''Test that adding entities and resuming training works passably OK. + There are two issues here: + + 1) We have to readd labels. This isn't very nice. + 2) There's no way to set the learning rate for the weight update, so we + end up out-of-scale, causing it to learn too fast. + ''' + nlp = English() + doc = nlp(u"I am looking for a restaurant in Berlin") + ents_before_train = [(ent.label_, ent.text) for ent in doc.ents] + # Fine tune the ner model + for entity_type in additional_entity_types: + if entity_type not in nlp.entity.cfg['actions']['1']: + nlp.entity.add_label(entity_type) + + nlp.entity.learn_rate = 0.001 + for itn in range(4): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + nlp.tagger(doc) + gold = GoldParse(doc, entities=entity_offsets) + loss = nlp.entity.update(doc, gold) + + with temp_save_model(nlp.entity) as model_dir: + # Load the fine tuned model + loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab) + + for entity_type in additional_entity_types: + if entity_type not in loaded_ner.cfg['actions']['1']: + loaded_ner.add_label(entity_type) + + doc = nlp(u"I am looking for a restaurant in Berlin", entity=False) + nlp.tagger(doc) + loaded_ner(doc) + + ents_after_train = [(ent.label_, ent.text) for ent in doc.ents] + assert ents_before_train == ents_after_train From da135bd823dcd57520c2032ccf536adb35e9c087 Mon Sep 17 00:00:00 2001 From: Iddo Berger Date: Fri, 24 Mar 2017 18:27:44 +0300 Subject: [PATCH 05/15] add hebrew tokenizer --- spacy/he/__init__.py | 18 +++ spacy/he/language_data.py | 17 +++ spacy/he/stop_words.py | 226 +++++++++++++++++++++++++++++++ spacy/tests/conftest.py | 6 + spacy/tests/he/__init__.py | 0 spacy/tests/he/test_tokenizer.py | 17 +++ 6 files changed, 284 insertions(+) create mode 100644 spacy/he/__init__.py create mode 100644 spacy/he/language_data.py create mode 100644 spacy/he/stop_words.py create mode 100644 spacy/tests/he/__init__.py create mode 100644 spacy/tests/he/test_tokenizer.py diff --git a/spacy/he/__init__.py b/spacy/he/__init__.py new file mode 100644 index 000000000..a3e86ed73 --- /dev/null +++ b/spacy/he/__init__.py @@ -0,0 +1,18 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from ..language import Language +from ..attrs import LANG + +from .language_data import * + + +class Hebrew(Language): + lang = 'he' + + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'he' + + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS diff --git a/spacy/he/language_data.py b/spacy/he/language_data.py new file mode 100644 index 000000000..a4a657c33 --- /dev/null +++ b/spacy/he/language_data.py @@ -0,0 +1,17 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS + + +STOP_WORDS = set(STOP_WORDS) + + +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/he/stop_words.py b/spacy/he/stop_words.py new file mode 100644 index 000000000..2914fa0d5 --- /dev/null +++ b/spacy/he/stop_words.py @@ -0,0 +1,226 @@ +# encoding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set(""" +אני +את +אתה +אנחנו +אתן +אתם +הם +הן +היא +הוא +שלי +שלו +שלך +שלה +שלנו +שלכם +שלכן +שלהם +שלהן +לי +לו +לה +לנו +לכם +לכן +להם +להן +אותה +אותו +זה +זאת +אלה +אלו +תחת +מתחת +מעל +בין +עם +עד +נגר +על +אל +מול +של +אצל +כמו +אחר +אותו +בלי +לפני +אחרי +מאחורי +עלי +עליו +עליה +עליך +עלינו +עליכם +לעיכן +עליהם +עליהן +כל +כולם +כולן +כך +ככה +כזה +זה +זות +אותי +אותה +אותם +אותך +אותו +אותן +אותנו +ואת +את +אתכם +אתכן +איתי +איתו +איתך +איתה +איתם +איתן +איתנו +איתכם +איתכן +יהיה +תהיה +היתי +היתה +היה +להיות +עצמי +עצמו +עצמה +עצמם +עצמן +עצמנו +עצמהם +עצמהן +מי +מה +איפה +היכן +במקום שבו +אם +לאן +למקום שבו +מקום בו +איזה +מהיכן +איך +כיצד +באיזו מידה +מתי +בשעה ש +כאשר +כש +למרות +לפני +אחרי +מאיזו סיבה +הסיבה שבגללה +למה +מדוע +לאיזו תכלית +כי +יש +אין +אך +מנין +מאין +מאיפה +יכל +יכלה +יכלו +יכול +יכולה +יכולים +יכולות +יוכלו +יוכל +מסוגל +לא +רק +אולי +אין +לאו +אי +כלל +נגד +אם +עם +אל +אלה +אלו +אף +על +מעל +מתחת +מצד +בשביל +לבין +באמצע +בתוך +דרך +מבעד +באמצעות +למעלה +למטה +מחוץ +מן +לעבר +מכאן +כאן +הנה +הרי +פה +שם +אך +ברם +שוב +אבל +מבלי +בלי +מלבד +רק +בגלל +מכיוון +עד +אשר +ואילו +למרות +אס +כמו +כפי +אז +אחרי +כן +לכן +לפיכך +מאד +עז +מעט +מעטים +במידה +שוב +יותר +מדי +גם +כן +נו +אחר +אחרת +אחרים +אחרות +אשר +או +""".split()) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b6dcb905a..f049d2f91 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -11,6 +11,7 @@ from ..nl import Dutch from ..sv import Swedish from ..hu import Hungarian from ..fi import Finnish +from ..he import Hebrew from ..tokens import Doc from ..strings import StringStore from ..lemmatizer import Lemmatizer @@ -73,6 +74,11 @@ def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() +@pytest.fixture +def he_tokenizer(): + return Hebrew.Defaults.create_tokenizer() + + @pytest.fixture def stringstore(): return StringStore() diff --git a/spacy/tests/he/__init__.py b/spacy/tests/he/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/he/test_tokenizer.py b/spacy/tests/he/test_tokenizer.py new file mode 100644 index 000000000..a6c65805a --- /dev/null +++ b/spacy/tests/he/test_tokenizer.py @@ -0,0 +1,17 @@ +# encoding: utf8 +from __future__ import unicode_literals + +import pytest + +ABBREVIATION_TESTS = [ + ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית']) +] + +TESTCASES = ABBREVIATION_TESTS + + +@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens): + tokens = he_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list \ No newline at end of file From 8a0ee5fc81a0a3110557550801ae5db9c6c5fe6f Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 25 Mar 2017 10:18:38 +0100 Subject: [PATCH 06/15] Fix download commands --- website/docs/api/language-models.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index aad86f614..d93e617dd 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -5,8 +5,8 @@ include ../../_includes/_mixins p spaCy currently supports the following languages and capabilities: +aside-code("Download language models", "bash"). - python -m spacy.en.download all - python -m spacy.de.download all + python -m spacy download en + python -m spacy download de +table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"]) +row From b7f714b4988a267bed384badc79fd8272e5fb55c Mon Sep 17 00:00:00 2001 From: Greg Baker Date: Sat, 25 Mar 2017 21:36:38 +1100 Subject: [PATCH 07/15] Possible solution to #909 --- spacy/cli/link.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 0f28187b3..833b9aff2 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -5,6 +5,7 @@ import pip from pathlib import Path import importlib from .. import util +import sys def link(origin, link_name, force=False): @@ -43,7 +44,13 @@ def symlink(model_path, link_name, force): elif link_path.exists(): link_path.unlink() - link_path.symlink_to(model_path) + if sys.version.startswith('2.') and sys.platform.startswith('win'): + import subprocess + subprocess.call(['mklink','/d',str(link_path), str(model_path)], + shell=True) + else: + link_path.symlink_to(model_path) + util.print_msg( "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()), "You can now load the model via spacy.load('{l}').".format(l=link_name), From fdec758113e3e0782241d7d1a94d4a0c8a611215 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 25 Mar 2017 14:04:02 +0100 Subject: [PATCH 08/15] Add is_windows and is_python2 utility functions --- spacy/util.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 893ba87c1..6c25ce0e8 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -153,6 +153,16 @@ def check_renamed_kwargs(renamed, kwargs): raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) +def is_windows(): + """Check if user is on Windows.""" + return sys.platform.startswith('win') + + +def is_python2(): + """Check if Python 2 is used.""" + return sys.version.startswith('2.') + + def parse_package_meta(package_path, package, require=True): location = os.path.join(str(package_path), package, 'meta.json') if os.path.isfile(location): From 97814f8da6b5d7e5da12a8d0ac62ce441730e479 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 25 Mar 2017 14:04:27 +0100 Subject: [PATCH 09/15] Update Windows Python 2 link workaround to use helper functions --- spacy/cli/link.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 833b9aff2..3777fd85f 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -5,7 +5,6 @@ import pip from pathlib import Path import importlib from .. import util -import sys def link(origin, link_name, force=False): @@ -44,10 +43,11 @@ def symlink(model_path, link_name, force): elif link_path.exists(): link_path.unlink() - if sys.version.startswith('2.') and sys.platform.startswith('win'): + # Add workaround for Python 2 on Windows (see issue #909) + if util.is_python2() and util.is_windows(): import subprocess - subprocess.call(['mklink','/d',str(link_path), str(model_path)], - shell=True) + command = ['mklink', '/d', link_path.as_posix(), model_path.as_posix()] + subprocess.call(command, shell=True) else: link_path.symlink_to(model_path) From 4454c1b23fc3793257ef20174389337f48be596a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 25 Mar 2017 21:29:57 +0100 Subject: [PATCH 10/15] Block lemmatization of base-form adjectives Fixes check that an adjective is a base form (as opposed to a comparative or superlative), so that it's not lemmatized. e.g. inner -!> inn. Closes #912. --- spacy/lemmatizer.py | 8 +++++++- spacy/tests/regression/test_issue912.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/regression/test_issue912.py diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 78ff43039..434c49e91 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -7,6 +7,8 @@ import ujson as json from .en.lemmatizer import INDEX, EXC, RULES from .symbols import POS, NOUN, VERB, ADJ, PUNCT from .symbols import VerbForm_inf, VerbForm_none +from .symbols import Number_sing +from .symbols import Degree_pos class Lemmatizer(object): @@ -42,6 +44,7 @@ class Lemmatizer(object): def is_base_form(self, univ_pos, morphology=None): '''Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely.''' + print("Is base form?", univ_pos, morphology) morphology = {} if morphology is None else morphology others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] true_morph_key = morphology.get('morph', 0) @@ -49,7 +52,10 @@ class Lemmatizer(object): return True elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others: return True - elif true_morph_key in (VerbForm_inf, VerbForm_none): + elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': + return True + elif true_morph_key in \ + (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos): return True else: return False diff --git a/spacy/tests/regression/test_issue912.py b/spacy/tests/regression/test_issue912.py new file mode 100644 index 000000000..791e2e152 --- /dev/null +++ b/spacy/tests/regression/test_issue912.py @@ -0,0 +1,14 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from ...tokens import Doc + + +@pytest.mark.parametrize('text,tag,lemma', [("inner", "JJ", "inner")]) +def test_issue912(en_vocab, text, tag, lemma): + '''Test base-forms of adjectives are preserved.''' + doc = Doc(en_vocab, words=[text]) + doc[0].tag_ = tag + assert doc[0].lemma_ == lemma + From 850d35dcb31f040e95d5329a4b6ef6c30ab6536e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 25 Mar 2017 21:49:10 +0100 Subject: [PATCH 11/15] Make morphology use int attributes internally The morphology class was calling the lemmatizer inconsistently, which some string-valued attributes. This caused Issue #903. --- spacy/morphology.pyx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index e98ef1e92..5b22dd28e 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -32,12 +32,11 @@ def _normalize_props(props): return out - cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer): self.mem = Pool() self.strings = string_store - self.tag_map = tag_map + self.tag_map = {} self.lemmatizer = lemmatizer self.n_tags = len(tag_map) + 1 self.tag_names = tuple(sorted(tag_map.keys())) @@ -52,6 +51,7 @@ cdef class Morphology: self.rich_tags[i].morph = 0 self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i + self.tag_map[tag_str] = attrs self._cache = PreshMapArray(self.n_tags) def __reduce__(self): @@ -74,10 +74,10 @@ cdef class Morphology: # Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): tag_id = self.reverse_index[self.strings['SP']] + rich_tag = self.rich_tags[tag_id] analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) - analysis.tag = self.rich_tags[tag_id] tag_str = self.strings[self.rich_tags[tag_id].name] analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, self.tag_map.get(tag_str, {})) @@ -126,8 +126,7 @@ cdef class Morphology: else: self.assign_feature(&cached.tag.morph, name_id, value_id) if cached.lemma == 0: - cached.lemma = self.lemmatize(rich_tag.pos, orth, - self.tag_map.get(tag_str, {})) + cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs) self._cache.set(tag_id, orth, cached) def load_morph_exceptions(self, dict exc): From 4f400fa486ebf4fa7ef5aa90607cca68acb301a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 25 Mar 2017 21:51:12 +0100 Subject: [PATCH 12/15] Prevent lemmatization of base nouns Update lemmatizer's base-form check, for change in morphology class. Closes #903. --- spacy/lemmatizer.py | 8 +++----- spacy/tests/regression/test_issue903.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/regression/test_issue903.py diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 434c49e91..d10b40d7b 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -44,18 +44,16 @@ class Lemmatizer(object): def is_base_form(self, univ_pos, morphology=None): '''Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely.''' - print("Is base form?", univ_pos, morphology) morphology = {} if morphology is None else morphology others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] true_morph_key = morphology.get('morph', 0) - if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others: + if univ_pos == 'noun' and morphology.get('Number') == 'sing' and not others: return True - elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others: + elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf' and not others: return True elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': return True - elif true_morph_key in \ - (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos): + elif true_morph_key in (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos): return True else: return False diff --git a/spacy/tests/regression/test_issue903.py b/spacy/tests/regression/test_issue903.py new file mode 100644 index 000000000..36acd2dfc --- /dev/null +++ b/spacy/tests/regression/test_issue903.py @@ -0,0 +1,16 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from ...tokens import Doc + + +@pytest.mark.parametrize('text,tag,lemma', + [("anus", "NN", "anus"), + ("princess", "NN", "princess")]) +def test_issue912(en_vocab, text, tag, lemma): + '''Test base-forms of adjectives are preserved.''' + doc = Doc(en_vocab, words=[text]) + doc[0].tag_ = tag + assert doc[0].lemma_ == lemma + From c748907a667c1ba0f9b2576edaf2a92951c197cb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 25 Mar 2017 21:56:41 +0100 Subject: [PATCH 13/15] Fix errors in previous commit --- spacy/lemmatizer.py | 13 ++++++++++--- spacy/morphology.pyx | 5 ++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d10b40d7b..5cd4842a9 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -47,13 +47,20 @@ class Lemmatizer(object): morphology = {} if morphology is None else morphology others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] true_morph_key = morphology.get('morph', 0) - if univ_pos == 'noun' and morphology.get('Number') == 'sing' and not others: + print(univ_pos, morphology) + if univ_pos == 'noun' and morphology.get('Number') == 'sing': return True - elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf' and not others: + elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf': return True elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': return True - elif true_morph_key in (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos): + elif VerbForm_inf in morphology: + return True + elif VerbForm_none in morphology: + return True + elif Number_sing in morphology: + return True + elif Degree_pos in morphology: return True else: return False diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 5b22dd28e..372bbb5ce 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -25,6 +25,8 @@ def _normalize_props(props): if value in POS_IDS: value = POS_IDS[value] out[key] = value + elif isinstance(key, int): + out[key] = value elif key.lower() == 'pos': out[POS] = POS_IDS[value.upper()] else: @@ -45,13 +47,13 @@ cdef class Morphology: self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) + self.tag_map[tag_str] = dict(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.rich_tags[i].id = i self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].morph = 0 self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i - self.tag_map[tag_str] = attrs self._cache = PreshMapArray(self.n_tags) def __reduce__(self): @@ -79,6 +81,7 @@ cdef class Morphology: if analysis is NULL: analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) tag_str = self.strings[self.rich_tags[tag_id].name] + analysis.tag = rich_tag analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, self.tag_map.get(tag_str, {})) self._cache.set(tag_id, token.lex.orth, analysis) From b94286de30fdd154cfcd1c88889819c047693f7a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 25 Mar 2017 22:35:07 +0100 Subject: [PATCH 14/15] Fix regression test --- spacy/tests/regression/test_issue595.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue595.py b/spacy/tests/regression/test_issue595.py index 6c73a621a..4a83a4020 100644 --- a/spacy/tests/regression/test_issue595.py +++ b/spacy/tests/regression/test_issue595.py @@ -12,7 +12,7 @@ import pytest def test_issue595(): """Test lemmatization of base forms""" words = ["Do", "n't", "feed", "the", "dog"] - tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}} + tag_map = {'VB': {POS: VERB, VerbForm_inf: True}} rules = {"verb": [["ed", "e"]]} lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules) From 2f63806ddb336800f26c04e29a7ecdc4d12be13f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 25 Mar 2017 22:35:44 +0100 Subject: [PATCH 15/15] Update config when adding label. Re #910 --- spacy/pipeline.pyx | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index b2d622329..ea8221cff 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -19,6 +19,9 @@ cdef class EntityRecognizer(Parser): def add_label(self, label): for action in self.moves.action_types: self.moves.add_action(action, label) + if 'actions' in self.cfg: + self.cfg['actions'].setdefault(action, + {}).setdefault(label, True) if isinstance(label, basestring): label = self.vocab.strings[label] for attr, freqs in self.vocab.serializer_freqs: @@ -37,6 +40,9 @@ cdef class BeamEntityRecognizer(BeamParser): def add_label(self, label): for action in self.moves.action_types: self.moves.add_action(action, label) + if 'actions' in self.cfg: + self.cfg['actions'].setdefault(action, + {}).setdefault(label, True) if isinstance(label, basestring): label = self.vocab.strings[label] for attr, freqs in self.vocab.serializer_freqs: @@ -54,6 +60,9 @@ cdef class DependencyParser(Parser): def add_label(self, label): for action in self.moves.action_types: self.moves.add_action(action, label) + if 'actions' in self.cfg: + self.cfg['actions'].setdefault(action, + {}).setdefault(label, True) if isinstance(label, basestring): label = self.vocab.strings[label] for attr, freqs in self.vocab.serializer_freqs: @@ -71,6 +80,9 @@ cdef class BeamDependencyParser(BeamParser): def add_label(self, label): for action in self.moves.action_types: self.moves.add_action(action, label) + if 'actions' in self.cfg: + self.cfg['actions'].setdefault(action, + {}).setdefault(label, True) if isinstance(label, basestring): label = self.vocab.strings[label] for attr, freqs in self.vocab.serializer_freqs: