Merge branch 'master' into develop

This commit is contained in:
Matthew Honnibal 2017-03-26 04:45:43 -05:00
commit 5eac089fbe
19 changed files with 483 additions and 403 deletions

View File

@ -1,130 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import codecs
import random
import time
import gzip
import plac
import cProfile
import pstats
import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config
def is_punct_label(label):
return label == 'P' or label.lower() == 'punct'
def read_gold(file_):
"""Read a standard CoNLL/MALT-style format"""
sents = []
for sent_str in file_.read().strip().split('\n\n'):
ids = []
words = []
heads = []
labels = []
tags = []
for i, line in enumerate(sent_str.split('\n')):
id_, word, pos_string, head_idx, label = _parse_line(line)
words.append(word)
if head_idx == -1:
head_idx = i
ids.append(id_)
heads.append(head_idx)
labels.append(label)
tags.append(pos_string)
text = ' '.join(words)
sents.append((text, [words], ids, words, tags, heads, labels))
return sents
def _parse_line(line):
pieces = line.split()
id_ = int(pieces[0])
word = pieces[1]
pos = pieces[3]
head_idx = int(pieces[6])
label = pieces[7]
return id_, word, pos, head_idx, label
def iter_data(paragraphs, tokenizer, gold_preproc=False):
for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
assert len(words) == len(heads)
for words in tokenized:
sent_ids = ids[:len(words)]
sent_tags = tags[:len(words)]
sent_heads = heads[:len(words)]
sent_labels = labels[:len(words)]
sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
tokens = tokenizer.tokens_from_list(words)
yield tokens, sent_tags, sent_heads, sent_labels
ids = ids[len(words):]
tags = tags[len(words):]
heads = heads[len(words):]
labels = labels[len(words):]
def _map_indices_to_tokens(ids, heads):
mapped = []
for head in heads:
if head not in ids:
mapped.append(None)
else:
mapped.append(ids.index(head))
return mapped
def evaluate(Language, dev_loc, model_dir):
global loss
nlp = Language()
n_corr = 0
pos_corr = 0
n_tokens = 0
total = 0
skipped = 0
loss = 0
with codecs.open(dev_loc, 'r', 'utf8') as file_:
paragraphs = read_gold(file_)
for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
assert len(tokens) == len(labels)
nlp.tagger.tag_from_strings(tokens, tag_strs)
nlp.parser(tokens)
for i, token in enumerate(tokens):
try:
pos_corr += token.tag_ == tag_strs[i]
except:
print i, token.orth_, token.tag
raise
n_tokens += 1
if heads[i] is None:
skipped += 1
continue
if is_punct_label(labels[i]):
continue
n_corr += token.head.i == heads[i]
total += 1
print loss, skipped, (loss+skipped + total)
print pos_corr / n_tokens
return float(n_corr) / (total + loss)
def main(dev_loc, model_dir):
print evaluate(English, dev_loc, model_dir)
if __name__ == '__main__':
plac.call(main)

View File

@ -1,261 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import codecs
import random
import plac
import cProfile
import pstats
import re
import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.syntax.parser import Parser, get_templates
from spacy._theano import TheanoModel
import theano
import theano.tensor as T
from theano.printing import Print
import numpy
from collections import OrderedDict, defaultdict
theano.config.profile = False
theano.config.floatX = 'float32'
floatX = theano.config.floatX
def L1(L1_reg, *weights):
return L1_reg * sum(abs(w).sum() for w in weights)
def L2(L2_reg, *weights):
return L2_reg * sum((w ** 2).sum() for w in weights)
def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
updates = OrderedDict()
for param in params:
value = param.get_value(borrow=True)
accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=param.broadcastable)
grad = T.grad(loss, param)
accu_new = rho * accu + (1 - rho) * grad ** 2
updates[accu] = accu_new
updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
return updates
def relu(x):
return x * (x > 0)
def feed_layer(activation, weights, bias, input_):
return activation(T.dot(input_, weights) + bias)
def init_weights(n_in, n_out):
rng = numpy.random.RandomState(1235)
weights = numpy.asarray(
rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
dtype=theano.config.floatX
)
bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
return [wrapper(weights, name='W'), wrapper(bias, name='b')]
def compile_model(n_classes, n_hidden, n_in, optimizer):
x = T.vector('x')
costs = T.ivector('costs')
loss = T.scalar('loss')
maxent_W, maxent_b = init_weights(n_hidden, n_classes)
hidden_W, hidden_b = init_weights(n_in, n_hidden)
# Feed the inputs forward through the network
p_y_given_x = feed_layer(
T.nnet.softmax,
maxent_W,
maxent_b,
feed_layer(
relu,
hidden_W,
hidden_b,
x))
loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)
train_model = theano.function(
name='train_model',
inputs=[x, costs],
outputs=[p_y_given_x[0], T.grad(loss, x), loss],
updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
on_unused_input='warn'
)
evaluate_model = theano.function(
name='evaluate_model',
inputs=[x],
outputs=[
feed_layer(
T.nnet.softmax,
maxent_W,
maxent_b,
feed_layer(
relu,
hidden_W,
hidden_b,
x
)
)[0]
]
)
return train_model, evaluate_model
def score_model(scorer, nlp, annot_tuples, verbose=False):
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.parser(tokens)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
seed=0, n_sents=0, verbose=False):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(dep_model_dir):
shutil.rmtree(dep_model_dir)
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir)
setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
Config.write(dep_model_dir, 'config',
seed=seed,
templates=tuple(),
labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
vector_lengths=(nv_word, nv_tag, nv_label),
hidden_nodes=nv_hidden,
eta=eta,
mu=mu
)
# Bake-in hyper-parameters
optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
nlp = Language(data_dir=model_dir)
n_classes = nlp.parser.model.n_classes
train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
nlp.parser.model = TheanoModel(n_classes, input_spec, train,
predict, model_loc)
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
log_loc = path.join(model_dir, 'job.log')
for itn in range(n_iter):
scorer = Scorer()
loss = 0
for _, sents in gold_tuples:
for annot_tuples, ctnt in sents:
if len(annot_tuples[1]) == 1:
continue
score_model(scorer, nlp, annot_tuples)
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples, make_projective=True)
assert gold.is_projective
loss += nlp.parser.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples)
logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
scorer.tags_acc,
scorer.token_acc)
print logline
with open(log_loc, 'aw') as file_:
file_.write(logline + '\n')
nlp.parser.model.end_training()
nlp.tagger.model.end_training()
nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
return nlp
def evaluate(nlp, gold_tuples, gold_preproc=True):
scorer = Scorer()
for raw_text, sents in gold_tuples:
for annot_tuples, brackets in sents:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.parser(tokens)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold)
return scorer
@plac.annotations(
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
n_sents=("Number of training sentences", "option", "n", int),
n_iter=("Number of training iterations", "option", "i", int),
verbose=("Verbose error reporting", "flag", "v", bool),
nv_word=("Word vector length", "option", "W", int),
nv_tag=("Tag vector length", "option", "T", int),
nv_label=("Label vector length", "option", "L", int),
nv_hidden=("Hidden nodes length", "option", "H", int),
eta=("Learning rate", "option", "E", float),
mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
eta=0.1, mu=0.9, eval_only=False):
gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))
nlp = train(English, gold_train, model_dir,
feat_set='embed',
eta=eta, mu=mu,
nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
n_sents=n_sents, n_iter=n_iter,
verbose=verbose)
scorer = evaluate(nlp, list(read_json_file(dev_loc)))
print 'TOK', 100-scorer.token_acc
print 'POS', scorer.tags_acc
print 'UAS', scorer.uas
print 'LAS', scorer.las
print 'NER P', scorer.ents_p
print 'NER R', scorer.ents_r
print 'NER F', scorer.ents_f
if __name__ == '__main__':
plac.call(main)

View File

@ -56,7 +56,8 @@ def get_version(model, comp):
def download_model(filename): def download_model(filename):
util.print_msg("Downloading {f}".format(f=filename)) util.print_msg("Downloading {f}".format(f=filename))
download_url = about.__download_url__ + '/' + filename download_url = about.__download_url__ + '/' + filename
subprocess.call([sys.executable, '-m', 'pip', 'install', download_url], subprocess.call([sys.executable, '-m',
'pip', 'install', '--no-cache-dir', download_url],
env=os.environ.copy()) env=os.environ.copy())

View File

@ -43,7 +43,14 @@ def symlink(model_path, link_name, force):
elif link_path.exists(): elif link_path.exists():
link_path.unlink() link_path.unlink()
link_path.symlink_to(model_path) # Add workaround for Python 2 on Windows (see issue #909)
if util.is_python2() and util.is_windows():
import subprocess
command = ['mklink', '/d', link_path.as_posix(), model_path.as_posix()]
subprocess.call(command, shell=True)
else:
link_path.symlink_to(model_path)
util.print_msg( util.print_msg(
"{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()), "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
"You can now load the model via spacy.load('{l}').".format(l=link_name), "You can now load the model via spacy.load('{l}').".format(l=link_name),

18
spacy/he/__init__.py Normal file
View File

@ -0,0 +1,18 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
from ..language import Language
from ..attrs import LANG
from .language_data import *
class Hebrew(Language):
lang = 'he'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS

17
spacy/he/language_data.py Normal file
View File

@ -0,0 +1,17 @@
# encoding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

226
spacy/he/stop_words.py Normal file
View File

@ -0,0 +1,226 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
אני
את
אתה
אנחנו
אתן
אתם
הם
הן
היא
הוא
שלי
שלו
שלך
שלה
שלנו
שלכם
שלכן
שלהם
שלהן
לי
לו
לה
לנו
לכם
לכן
להם
להן
אותה
אותו
זה
זאת
אלה
אלו
תחת
מתחת
מעל
בין
עם
עד
נגר
על
אל
מול
של
אצל
כמו
אחר
אותו
בלי
לפני
אחרי
מאחורי
עלי
עליו
עליה
עליך
עלינו
עליכם
לעיכן
עליהם
עליהן
כל
כולם
כולן
כך
ככה
כזה
זה
זות
אותי
אותה
אותם
אותך
אותו
אותן
אותנו
ואת
את
אתכם
אתכן
איתי
איתו
איתך
איתה
איתם
איתן
איתנו
איתכם
איתכן
יהיה
תהיה
היתי
היתה
היה
להיות
עצמי
עצמו
עצמה
עצמם
עצמן
עצמנו
עצמהם
עצמהן
מי
מה
איפה
היכן
במקום שבו
אם
לאן
למקום שבו
מקום בו
איזה
מהיכן
איך
כיצד
באיזו מידה
מתי
בשעה ש
כאשר
כש
למרות
לפני
אחרי
מאיזו סיבה
הסיבה שבגללה
למה
מדוע
לאיזו תכלית
כי
יש
אין
אך
מנין
מאין
מאיפה
יכל
יכלה
יכלו
יכול
יכולה
יכולים
יכולות
יוכלו
יוכל
מסוגל
לא
רק
אולי
אין
לאו
אי
כלל
נגד
אם
עם
אל
אלה
אלו
אף
על
מעל
מתחת
מצד
בשביל
לבין
באמצע
בתוך
דרך
מבעד
באמצעות
למעלה
למטה
מחוץ
מן
לעבר
מכאן
כאן
הנה
הרי
פה
שם
אך
ברם
שוב
אבל
מבלי
בלי
מלבד
רק
בגלל
מכיוון
עד
אשר
ואילו
למרות
אס
כמו
כפי
אז
אחרי
כן
לכן
לפיכך
מאד
עז
מעט
מעטים
במידה
שוב
יותר
מדי
גם
כן
נו
אחר
אחרת
אחרים
אחרות
אשר
או
""".split())

View File

@ -7,6 +7,8 @@ import ujson as json
from .en.lemmatizer import INDEX, EXC, RULES from .en.lemmatizer import INDEX, EXC, RULES
from .symbols import POS, NOUN, VERB, ADJ, PUNCT from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import VerbForm_inf, VerbForm_none from .symbols import VerbForm_inf, VerbForm_none
from .symbols import Number_sing
from .symbols import Degree_pos
class Lemmatizer(object): class Lemmatizer(object):
@ -45,11 +47,20 @@ class Lemmatizer(object):
morphology = {} if morphology is None else morphology morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
true_morph_key = morphology.get('morph', 0) true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others: print(univ_pos, morphology)
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
return True return True
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others: elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
return True return True
elif true_morph_key in (VerbForm_inf, VerbForm_none): elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
return True
elif VerbForm_inf in morphology:
return True
elif VerbForm_none in morphology:
return True
elif Number_sing in morphology:
return True
elif Degree_pos in morphology:
return True return True
else: else:
return False return False

View File

@ -25,6 +25,8 @@ def _normalize_props(props):
if value in POS_IDS: if value in POS_IDS:
value = POS_IDS[value] value = POS_IDS[value]
out[key] = value out[key] = value
elif isinstance(key, int):
out[key] = value
elif key.lower() == 'pos': elif key.lower() == 'pos':
out[POS] = POS_IDS[value.upper()] out[POS] = POS_IDS[value.upper()]
else: else:
@ -32,12 +34,11 @@ def _normalize_props(props):
return out return out
cdef class Morphology: cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer): def __init__(self, StringStore string_store, tag_map, lemmatizer):
self.mem = Pool() self.mem = Pool()
self.strings = string_store self.strings = string_store
self.tag_map = tag_map self.tag_map = {}
self.lemmatizer = lemmatizer self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) + 1 self.n_tags = len(tag_map) + 1
self.tag_names = tuple(sorted(tag_map.keys())) self.tag_names = tuple(sorted(tag_map.keys()))
@ -46,6 +47,7 @@ cdef class Morphology:
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC)) self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs) attrs = _normalize_props(attrs)
self.tag_map[tag_str] = dict(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].name = self.strings[tag_str]
@ -74,11 +76,12 @@ cdef class Morphology:
# Related to Issue #220 # Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE): if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings['SP']] tag_id = self.reverse_index[self.strings['SP']]
rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL: if analysis is NULL:
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC)) analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
analysis.tag = self.rich_tags[tag_id]
tag_str = self.strings[self.rich_tags[tag_id].name] tag_str = self.strings[self.rich_tags[tag_id].name]
analysis.tag = rich_tag
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
self.tag_map.get(tag_str, {})) self.tag_map.get(tag_str, {}))
self._cache.set(tag_id, token.lex.orth, analysis) self._cache.set(tag_id, token.lex.orth, analysis)
@ -126,8 +129,7 @@ cdef class Morphology:
else: else:
self.assign_feature(&cached.tag.morph, name_id, value_id) self.assign_feature(&cached.tag.morph, name_id, value_id)
if cached.lemma == 0: if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth, cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
self.tag_map.get(tag_str, {}))
self._cache.set(tag_id, orth, <void*>cached) self._cache.set(tag_id, orth, <void*>cached)
def load_morph_exceptions(self, dict exc): def load_morph_exceptions(self, dict exc):

View File

@ -19,6 +19,9 @@ cdef class EntityRecognizer(Parser):
def add_label(self, label): def add_label(self, label):
for action in self.moves.action_types: for action in self.moves.action_types:
self.moves.add_action(action, label) self.moves.add_action(action, label)
if 'actions' in self.cfg:
self.cfg['actions'].setdefault(action,
{}).setdefault(label, True)
if isinstance(label, basestring): if isinstance(label, basestring):
label = self.vocab.strings[label] label = self.vocab.strings[label]
for attr, freqs in self.vocab.serializer_freqs: for attr, freqs in self.vocab.serializer_freqs:
@ -37,6 +40,9 @@ cdef class BeamEntityRecognizer(BeamParser):
def add_label(self, label): def add_label(self, label):
for action in self.moves.action_types: for action in self.moves.action_types:
self.moves.add_action(action, label) self.moves.add_action(action, label)
if 'actions' in self.cfg:
self.cfg['actions'].setdefault(action,
{}).setdefault(label, True)
if isinstance(label, basestring): if isinstance(label, basestring):
label = self.vocab.strings[label] label = self.vocab.strings[label]
for attr, freqs in self.vocab.serializer_freqs: for attr, freqs in self.vocab.serializer_freqs:
@ -54,6 +60,9 @@ cdef class DependencyParser(Parser):
def add_label(self, label): def add_label(self, label):
for action in self.moves.action_types: for action in self.moves.action_types:
self.moves.add_action(action, label) self.moves.add_action(action, label)
if 'actions' in self.cfg:
self.cfg['actions'].setdefault(action,
{}).setdefault(label, True)
if isinstance(label, basestring): if isinstance(label, basestring):
label = self.vocab.strings[label] label = self.vocab.strings[label]
for attr, freqs in self.vocab.serializer_freqs: for attr, freqs in self.vocab.serializer_freqs:
@ -71,6 +80,9 @@ cdef class BeamDependencyParser(BeamParser):
def add_label(self, label): def add_label(self, label):
for action in self.moves.action_types: for action in self.moves.action_types:
self.moves.add_action(action, label) self.moves.add_action(action, label)
if 'actions' in self.cfg:
self.cfg['actions'].setdefault(action,
{}).setdefault(label, True)
if isinstance(label, basestring): if isinstance(label, basestring):
label = self.vocab.strings[label] label = self.vocab.strings[label]
for attr, freqs in self.vocab.serializer_freqs: for attr, freqs in self.vocab.serializer_freqs:

View File

@ -12,6 +12,8 @@ from ..sv import Swedish
from ..hu import Hungarian from ..hu import Hungarian
from ..fi import Finnish from ..fi import Finnish
from ..bn import Bengali from ..bn import Bengali
from ..he import Hebrew
from ..tokens import Doc from ..tokens import Doc
from ..strings import StringStore from ..strings import StringStore
from ..lemmatizer import Lemmatizer from ..lemmatizer import Lemmatizer
@ -78,6 +80,11 @@ def bn_tokenizer():
return Bengali.Defaults.create_tokenizer() return Bengali.Defaults.create_tokenizer()
@pytest.fixture
def he_tokenizer():
return Hebrew.Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def stringstore(): def stringstore():
return StringStore() return StringStore()

View File

View File

@ -0,0 +1,17 @@
# encoding: utf8
from __future__ import unicode_literals
import pytest
ABBREVIATION_TESTS = [
('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
]
TESTCASES = ABBREVIATION_TESTS
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
tokens = he_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@ -12,7 +12,7 @@ import pytest
def test_issue595(): def test_issue595():
"""Test lemmatization of base forms""" """Test lemmatization of base forms"""
words = ["Do", "n't", "feed", "the", "dog"] words = ["Do", "n't", "feed", "the", "dog"]
tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}} tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
rules = {"verb": [["ed", "e"]]} rules = {"verb": [["ed", "e"]]}
lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules) lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)

View File

@ -0,0 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ...tokens import Doc
@pytest.mark.parametrize('text,tag,lemma',
[("anus", "NN", "anus"),
("princess", "NN", "princess")])
def test_issue912(en_vocab, text, tag, lemma):
'''Test base-forms of adjectives are preserved.'''
doc = Doc(en_vocab, words=[text])
doc[0].tag_ = tag
assert doc[0].lemma_ == lemma

View File

@ -0,0 +1,113 @@
from __future__ import unicode_literals
import json
import os
import random
import contextlib
import shutil
import pytest
import tempfile
from pathlib import Path
import pathlib
from ...gold import GoldParse
from ...pipeline import EntityRecognizer
from ...en import English
try:
unicode
except NameError:
unicode = str
@pytest.fixture
def train_data():
return [
["hey",[]],
["howdy",[]],
["hey there",[]],
["hello",[]],
["hi",[]],
["i'm looking for a place to eat",[]],
["i'm looking for a place in the north of town",[[31,36,"location"]]],
["show me chinese restaurants",[[8,15,"cuisine"]]],
["show me chines restaurants",[[8,14,"cuisine"]]],
["yes",[]],
["yep",[]],
["yeah",[]],
["show me a mexican place in the centre",[[31,37,"location"], [10,17,"cuisine"]]],
["bye",[]],["goodbye",[]],
["good bye",[]],
["stop",[]],
["end",[]],
["i am looking for an indian spot",[[20,26,"cuisine"]]],
["search for restaurants",[]],
["anywhere in the west",[[16,20,"location"]]],
["central indian restaurant",[[0,7,"location"],[8,14,"cuisine"]]],
["indeed",[]],
["that's right",[]],
["ok",[]],
["great",[]]
]
@pytest.fixture
def additional_entity_types():
return ['cuisine', 'location']
@contextlib.contextmanager
def temp_save_model(model):
model_dir = Path(tempfile.mkdtemp())
# store the fine tuned model
with (model_dir / "config.json").open('w') as file_:
data = json.dumps(model.cfg)
if not isinstance(data, unicode):
data = data.decode('utf8')
file_.write(data)
model.model.dump((model_dir / 'model').as_posix())
yield model_dir
shutil.rmtree(model_dir.as_posix())
@pytest.mark.xfail
@pytest.mark.models
def test_issue910(train_data, additional_entity_types):
'''Test that adding entities and resuming training works passably OK.
There are two issues here:
1) We have to readd labels. This isn't very nice.
2) There's no way to set the learning rate for the weight update, so we
end up out-of-scale, causing it to learn too fast.
'''
nlp = English()
doc = nlp(u"I am looking for a restaurant in Berlin")
ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
# Fine tune the ner model
for entity_type in additional_entity_types:
if entity_type not in nlp.entity.cfg['actions']['1']:
nlp.entity.add_label(entity_type)
nlp.entity.learn_rate = 0.001
for itn in range(4):
random.shuffle(train_data)
for raw_text, entity_offsets in train_data:
doc = nlp.make_doc(raw_text)
nlp.tagger(doc)
gold = GoldParse(doc, entities=entity_offsets)
loss = nlp.entity.update(doc, gold)
with temp_save_model(nlp.entity) as model_dir:
# Load the fine tuned model
loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)
for entity_type in additional_entity_types:
if entity_type not in loaded_ner.cfg['actions']['1']:
loaded_ner.add_label(entity_type)
doc = nlp(u"I am looking for a restaurant in Berlin", entity=False)
nlp.tagger(doc)
loaded_ner(doc)
ents_after_train = [(ent.label_, ent.text) for ent in doc.ents]
assert ents_before_train == ents_after_train

View File

@ -0,0 +1,14 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ...tokens import Doc
@pytest.mark.parametrize('text,tag,lemma', [("inner", "JJ", "inner")])
def test_issue912(en_vocab, text, tag, lemma):
'''Test base-forms of adjectives are preserved.'''
doc = Doc(en_vocab, words=[text])
doc[0].tag_ = tag
assert doc[0].lemma_ == lemma

View File

@ -153,6 +153,16 @@ def check_renamed_kwargs(renamed, kwargs):
raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def is_windows():
"""Check if user is on Windows."""
return sys.platform.startswith('win')
def is_python2():
"""Check if Python 2 is used."""
return sys.version.startswith('2.')
def parse_package_meta(package_path, package, require=True): def parse_package_meta(package_path, package, require=True):
location = os.path.join(str(package_path), package, 'meta.json') location = os.path.join(str(package_path), package, 'meta.json')
if os.path.isfile(location): if os.path.isfile(location):

View File

@ -82,7 +82,7 @@ p
| conjunction features out of the atomic predictors. Let's say you have | conjunction features out of the atomic predictors. Let's say you have
| two atomic predictors asking, "What is the part-of-speech of the | two atomic predictors asking, "What is the part-of-speech of the
| previous token?", and "What is the part-of-speech of the previous | previous token?", and "What is the part-of-speech of the previous
| previous token?". These ppredictors will introduce a number of features, | previous token?". These predictors will introduce a number of features,
| e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
| template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ]. | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].