mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge branch 'master' into develop
This commit is contained in:
commit
5eac089fbe
|
@ -1,130 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import codecs
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
import gzip
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.en import English
|
|
||||||
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
|
|
||||||
|
|
||||||
from spacy.syntax.parser import GreedyParser
|
|
||||||
from spacy.syntax.parser import OracleError
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
|
|
||||||
|
|
||||||
def is_punct_label(label):
    """Return True when a dependency label marks punctuation.

    A label counts as punctuation if it is exactly 'P', or if it equals
    'punct' once lower-cased.
    """
    if label == 'P':
        return True
    return label.lower() == 'punct'
|
|
||||||
|
|
||||||
|
|
||||||
def read_gold(file_):
    """Read a standard CoNLL/MALT-style file into per-sentence annotations.

    The whole file is read at once. Sentences are separated by an empty
    line and hold one token per line; every token line is decoded with
    _parse_line. A head index of -1 is replaced by the token's own position
    within the sentence.

    Returns a list with one tuple per sentence:
    (text, [words], ids, words, tags, heads, labels), where text is the
    words joined by single spaces. Blank chunks (for example an empty file)
    produce no entry.
    """
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        # An empty file, or a run of extra blank lines, leaves blank lines in
        # the chunk; they used to reach _parse_line and raise IndexError.
        lines = [line for line in sent_str.split('\n') if line.strip()]
        if not lines:
            continue
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(lines):
            id_, word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx == -1:
                head_idx = i
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        sents.append((text, [words], ids, words, tags, heads, labels))
    return sents
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_line(line):
|
|
||||||
pieces = line.split()
|
|
||||||
id_ = int(pieces[0])
|
|
||||||
word = pieces[1]
|
|
||||||
pos = pieces[3]
|
|
||||||
head_idx = int(pieces[6])
|
|
||||||
label = pieces[7]
|
|
||||||
return id_, word, pos, head_idx, label
|
|
||||||
|
|
||||||
|
|
||||||
def iter_data(paragraphs, tokenizer, gold_preproc=False):
    """Yield (tokens, tags, heads, labels) for every sentence of every paragraph.

    Each paragraph is a 7-tuple as produced by read_gold. Its flat id, tag,
    head and label lists are consumed front to back, taking as many entries
    as the current sentence has words. Heads are converted to positions
    inside the sentence with _map_indices_to_tokens, and the words are turned
    into tokens with tokenizer.tokens_from_list.

    gold_preproc is accepted but not consulted by this function.
    """
    for _raw, sentences, ids, all_words, tags, heads, labels in paragraphs:
        assert len(all_words) == len(heads)
        start = 0
        for sent_words in sentences:
            stop = start + len(sent_words)
            mapped_heads = _map_indices_to_tokens(ids[start:stop],
                                                  heads[start:stop])
            tokens = tokenizer.tokens_from_list(sent_words)
            yield tokens, tags[start:stop], mapped_heads, labels[start:stop]
            # Move the window past the sentence that was just emitted.
            start = stop
|
|
||||||
|
|
||||||
|
|
||||||
def _map_indices_to_tokens(ids, heads):
|
|
||||||
mapped = []
|
|
||||||
for head in heads:
|
|
||||||
if head not in ids:
|
|
||||||
mapped.append(None)
|
|
||||||
else:
|
|
||||||
mapped.append(ids.index(head))
|
|
||||||
return mapped
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(Language, dev_loc, model_dir):
    """Score tagging and unlabelled attachment on a CoNLL-style dev file.

    Language is instantiated without arguments to build the pipeline, and
    dev_loc is opened as UTF-8 and read with read_gold. model_dir is accepted
    but not used by this function.

    For every token the predicted tag is compared with the gold tag. Tokens
    whose gold head is None are counted as skipped, and tokens with a
    punctuation label are left out of the attachment score. Two lines are
    printed: "loss skipped loss+skipped+total", then the tag accuracy.

    Returns the fraction of scored tokens whose predicted head position
    equals the gold head, or 0.0 when no token was scored.
    """
    # `loss` stays a module-level global as before; nothing in this function
    # adds to it, so it is 0 when the function returns.
    global loss
    nlp = Language()
    n_corr = 0
    pos_corr = 0
    n_tokens = 0
    total = 0
    skipped = 0
    loss = 0
    with codecs.open(dev_loc, 'r', 'utf8') as file_:
        paragraphs = read_gold(file_)
    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
        assert len(tokens) == len(labels)
        nlp.tagger.tag_from_strings(tokens, tag_strs)
        nlp.parser(tokens)
        for i, token in enumerate(tokens):
            try:
                pos_corr += token.tag_ == tag_strs[i]
            except Exception:
                # Show which token failed before propagating the error.
                print('%s %s %s' % (i, token.orth_, token.tag))
                raise
            n_tokens += 1
            if heads[i] is None:
                skipped += 1
                continue
            if is_punct_label(labels[i]):
                continue
            n_corr += token.head.i == heads[i]
            total += 1
    # Single-argument print calls give the same output as the old print
    # statements and are also valid where print is a function.
    print('%d %d %d' % (loss, skipped, loss + skipped + total))
    # An empty or fully skipped dev set used to raise ZeroDivisionError.
    print(pos_corr / n_tokens if n_tokens else 0.0)
    scored = total + loss
    return float(n_corr) / scored if scored else 0.0
|
|
||||||
|
|
||||||
|
|
||||||
def main(dev_loc, model_dir):
    """Evaluate the English pipeline on dev_loc and print the returned score."""
    score = evaluate(English, dev_loc, model_dir)
    print(score)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,261 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import codecs
|
|
||||||
import random
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
import re
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.en import English
|
|
||||||
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
|
|
||||||
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
from spacy.gold import read_json_file
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
|
|
||||||
from spacy.syntax.parser import Parser, get_templates
|
|
||||||
from spacy._theano import TheanoModel
|
|
||||||
|
|
||||||
import theano
|
|
||||||
import theano.tensor as T
|
|
||||||
|
|
||||||
from theano.printing import Print
|
|
||||||
|
|
||||||
import numpy
|
|
||||||
from collections import OrderedDict, defaultdict
|
|
||||||
|
|
||||||
|
|
||||||
theano.config.profile = False
|
|
||||||
theano.config.floatX = 'float32'
|
|
||||||
floatX = theano.config.floatX
|
|
||||||
|
|
||||||
|
|
||||||
def L1(L1_reg, *weights):
    """Return the L1 penalty: L1_reg times the summed absolute values of all weights."""
    penalty = 0
    for w in weights:
        penalty = penalty + abs(w).sum()
    return L1_reg * penalty
|
|
||||||
|
|
||||||
|
|
||||||
def L2(L2_reg, *weights):
    """Return the L2 penalty: L2_reg times the summed squares of all weights."""
    penalty = 0
    for w in weights:
        penalty = penalty + (w ** 2).sum()
    return L2_reg * penalty
|
|
||||||
|
|
||||||
|
|
||||||
def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    """Build RMSProp update rules for params with respect to loss.

    For every parameter a shared accumulator with the parameter value's shape
    and dtype is created, starting at zero. The accumulator is updated to
    rho * accumulator + (1 - rho) * grad ** 2, and the parameter is moved by
    -eta * grad / sqrt(new_accumulator + eps).

    Returns an OrderedDict that maps each accumulator and each parameter to
    its update expression.
    """
    updates = OrderedDict()
    for param in params:
        value = param.get_value(borrow=True)
        # This module imports numpy under its full name only; the previous
        # `np.zeros` raised NameError on the first parameter.
        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        grad = T.grad(loss, param)
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
    return updates
|
|
||||||
|
|
||||||
|
|
||||||
def relu(x):
    """Rectified linear unit: x where x is positive, zero elsewhere."""
    is_positive = x > 0
    return x * is_positive
|
|
||||||
|
|
||||||
|
|
||||||
def feed_layer(activation, weights, bias, input_):
    """Apply one dense layer: activation(dot(input_, weights) + bias)."""
    pre_activation = T.dot(input_, weights) + bias
    return activation(pre_activation)
|
|
||||||
|
|
||||||
|
|
||||||
def init_weights(n_in, n_out):
    """Create the weight matrix and bias vector for one dense layer.

    The (n_in, n_out) weights are drawn from a standard normal distribution
    scaled by sqrt(2 / n_in), using a RandomState that is re-seeded with 1235
    on every call. The bias is a zero vector of length n_out. Both arrays use
    theano.config.floatX.

    Returns [W, b] as Theano shared variables named 'W' and 'b'.
    """
    rng = numpy.random.RandomState(1235)

    weights = numpy.asarray(
        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
        dtype=theano.config.floatX
    )
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    # `wrapper` was never defined in this module, so this line raised
    # NameError. rms_prop calls get_value() and reads .broadcastable on these
    # parameters, i.e. it needs Theano shared variables.
    return [theano.shared(weights, name='W'), theano.shared(bias, name='b')]
|
|
||||||
|
|
||||||
|
|
||||||
def compile_model(n_classes, n_hidden, n_in, optimizer):
    """Compile the training and prediction functions of a one-hidden-layer net.

    The network feeds the input vector x through a relu layer of n_hidden
    units and then a softmax layer of n_classes units. The loss is the
    negative log of the probability mass (row 0 of the softmax output) on the
    classes whose entry in costs equals 0, with 1e-8 added inside the log.

    optimizer is called as optimizer(loss, params) and must return the
    updates passed to theano.function.

    Returns (train_model, evaluate_model):
      train_model(x, costs) -> [row 0 of the softmax output,
                                gradient of the loss w.r.t. x, loss],
                               applying the optimizer's updates;
      evaluate_model(x)     -> [row 0 of the softmax output].
    """
    x = T.vector('x')
    costs = T.ivector('costs')

    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
    hidden_W, hidden_b = init_weights(n_in, n_hidden)

    # Feed the inputs forward through the network. The graph is built once
    # and shared by both compiled functions.
    hidden = feed_layer(relu, hidden_W, hidden_b, x)
    p_y_given_x = feed_layer(T.nnet.softmax, maxent_W, maxent_b, hidden)

    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)

    train_model = theano.function(
        name='train_model',
        inputs=[x, costs],
        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
        on_unused_input='warn'
    )

    evaluate_model = theano.function(
        name='evaluate_model',
        inputs=[x],
        outputs=[p_y_given_x[0]]
    )
    return train_model, evaluate_model
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, annot_tuples, verbose=False):
    """Tag and parse one gold sentence with nlp and add the result to scorer.

    The tokens are built from annot_tuples[1], and the gold parse from the
    full annot_tuples.
    """
    words = annot_tuples[1]
    doc = nlp.tokenizer.tokens_from_list(words)
    nlp.tagger(doc)
    nlp.parser(doc)
    scorer.score(doc, GoldParse(doc, annot_tuples), verbose=verbose)
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
          seed=0, n_sents=0, verbose=False):
    """Train the tagger and a Theano-backed parser, writing under model_dir.

    Steps, in order:
      * delete and recreate model_dir/deps and model_dir/pos, set up the POS
        model directory and write the parser config;
      * load Language(data_dir=model_dir) and replace its parser model with a
        TheanoModel built from compile_model;
      * if n_sents > 0, keep only the first n_sents entries of gold_tuples;
      * for each of n_iter iterations, visit every sentence that does not
        consist of exactly one token: score it with the current model, then
        train the parser and the tagger on it. Afterwards shuffle gold_tuples
        in place, print a log line and append it to model_dir/job.log;
      * end training on both models and dump the vocabulary strings to
        model_dir/vocab/strings.txt.

    feat_set and verbose are accepted but not consulted; mu is only written
    to the parser config. When n_sents is 0 the caller's gold_tuples list is
    the one that gets shuffled.

    Returns the trained nlp object.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config',
        seed=seed,
        templates=tuple(),
        labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
        vector_lengths=(nv_word, nv_tag, nv_label),
        hidden_nodes=nv_hidden,
        eta=eta,
        mu=mu
    )

    # Bake-in hyper-parameters
    # `rho` and `eps` were referenced here but never defined anywhere in this
    # module; rms_prop's own defaults are used instead.
    # NOTE(review): confirm whether `mu` was meant to be passed as rho.
    def optimizer(loss, params):
        return rms_prop(loss, params, eta=eta)

    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    # NOTE(review): `n_in`, `input_spec` and `model_loc` are not defined in
    # this function or at module level in the visible source, so the next two
    # statements raise NameError as written. Their intended values depend on
    # TheanoModel's interface, which is not shown here.
    # The compiled functions get their own names so the local no longer
    # shadows this function's name.
    train_fn, predict_fn = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train_fn,
                                   predict_fn, model_loc)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    print("Itn.\tP.Loss\tUAS\tTag %\tToken %")
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                 scorer.tags_acc,
                                                 scorer.token_acc)
        print(logline)
        # 'aw' is not a valid mode string (Python 3 rejects it with
        # ValueError); plain append is what the log needs.
        with open(log_loc, 'a') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, gold_tuples, gold_preproc=True):
    """Tag and parse every gold sentence with nlp and return the filled Scorer.

    gold_tuples is iterated as (raw_text, sents) pairs, where sents holds
    (annot_tuples, brackets) pairs; only annot_tuples is used.
    gold_preproc is accepted but not consulted by this function.
    """
    scorer = Scorer()
    for _raw_text, sents in gold_tuples:
        for annot_tuples, _brackets in sents:
            doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(doc)
            nlp.parser(doc)
            scorer.score(doc, GoldParse(doc, annot_tuples))
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),

    nv_word=("Word vector length", "option", "W", int),
    nv_tag=("Tag vector length", "option", "T", int),
    nv_label=("Label vector length", "option", "L", int),
    nv_hidden=("Hidden nodes length", "option", "H", int),
    eta=("Learning rate", "option", "E", float),
    mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
         eta=0.1, mu=0.9, eval_only=False):
    """Train on the 'wsj' documents of train_loc, then report dev_loc scores.

    Only training documents whose 'id' contains 'wsj' are used. After
    training, the whole of dev_loc is evaluated and the tokenisation, tagging,
    attachment and entity scores are printed, one per line.

    NOTE: eval_only is accepted but not consulted here; training always runs.
    """
    def is_wsj(doc):
        return 'wsj' in doc['id']

    gold_train = list(read_json_file(train_loc, is_wsj))

    nlp = train(English, gold_train, model_dir,
                feat_set='embed',
                eta=eta, mu=mu,
                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label,
                nv_hidden=nv_hidden,
                n_sents=n_sents, n_iter=n_iter,
                verbose=verbose)

    gold_dev = list(read_json_file(dev_loc))
    scorer = evaluate(nlp, gold_dev)

    print('TOK' + ' ' + str(100-scorer.token_acc))
    # Each attribute is read right before its line is printed, in the same
    # order as the individual print statements this loop replaces.
    report = (
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in report:
        print(label + ' ' + str(getattr(scorer, attr)))


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -56,7 +56,8 @@ def get_version(model, comp):
|
||||||
def download_model(filename):
|
def download_model(filename):
|
||||||
util.print_msg("Downloading {f}".format(f=filename))
|
util.print_msg("Downloading {f}".format(f=filename))
|
||||||
download_url = about.__download_url__ + '/' + filename
|
download_url = about.__download_url__ + '/' + filename
|
||||||
subprocess.call([sys.executable, '-m', 'pip', 'install', download_url],
|
subprocess.call([sys.executable, '-m',
|
||||||
|
'pip', 'install', '--no-cache-dir', download_url],
|
||||||
env=os.environ.copy())
|
env=os.environ.copy())
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -43,7 +43,14 @@ def symlink(model_path, link_name, force):
|
||||||
elif link_path.exists():
|
elif link_path.exists():
|
||||||
link_path.unlink()
|
link_path.unlink()
|
||||||
|
|
||||||
link_path.symlink_to(model_path)
|
# Add workaround for Python 2 on Windows (see issue #909)
|
||||||
|
if util.is_python2() and util.is_windows():
|
||||||
|
import subprocess
|
||||||
|
command = ['mklink', '/d', link_path.as_posix(), model_path.as_posix()]
|
||||||
|
subprocess.call(command, shell=True)
|
||||||
|
else:
|
||||||
|
link_path.symlink_to(model_path)
|
||||||
|
|
||||||
util.print_msg(
|
util.print_msg(
|
||||||
"{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
|
"{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
|
||||||
"You can now load the model via spacy.load('{l}').".format(l=link_name),
|
"You can now load the model via spacy.load('{l}').".format(l=link_name),
|
||||||
|
|
18
spacy/he/__init__.py
Normal file
18
spacy/he/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
from ..language import Language
|
||||||
|
from ..attrs import LANG
|
||||||
|
|
||||||
|
from .language_data import *
|
||||||
|
|
||||||
|
|
||||||
|
class Hebrew(Language):
    """Language subclass registered under the code 'he'."""
    lang = 'he'

    class Defaults(Language.Defaults):
        """Defaults for Hebrew: tokenizer exceptions, stop words and LANG getter."""
        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS

        # Work on a copy so the getter table of Language.Defaults itself is
        # left untouched when the LANG entry is overridden.
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'he'
|
17
spacy/he/language_data.py
Normal file
17
spacy/he/language_data.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .. import language_data as base
|
||||||
|
from ..language_data import update_exc, strings_to_exc
|
||||||
|
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
STOP_WORDS = set(STOP_WORDS)
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||||
|
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
226
spacy/he/stop_words.py
Normal file
226
spacy/he/stop_words.py
Normal file
|
@ -0,0 +1,226 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
STOP_WORDS = set("""
|
||||||
|
אני
|
||||||
|
את
|
||||||
|
אתה
|
||||||
|
אנחנו
|
||||||
|
אתן
|
||||||
|
אתם
|
||||||
|
הם
|
||||||
|
הן
|
||||||
|
היא
|
||||||
|
הוא
|
||||||
|
שלי
|
||||||
|
שלו
|
||||||
|
שלך
|
||||||
|
שלה
|
||||||
|
שלנו
|
||||||
|
שלכם
|
||||||
|
שלכן
|
||||||
|
שלהם
|
||||||
|
שלהן
|
||||||
|
לי
|
||||||
|
לו
|
||||||
|
לה
|
||||||
|
לנו
|
||||||
|
לכם
|
||||||
|
לכן
|
||||||
|
להם
|
||||||
|
להן
|
||||||
|
אותה
|
||||||
|
אותו
|
||||||
|
זה
|
||||||
|
זאת
|
||||||
|
אלה
|
||||||
|
אלו
|
||||||
|
תחת
|
||||||
|
מתחת
|
||||||
|
מעל
|
||||||
|
בין
|
||||||
|
עם
|
||||||
|
עד
|
||||||
|
נגר
|
||||||
|
על
|
||||||
|
אל
|
||||||
|
מול
|
||||||
|
של
|
||||||
|
אצל
|
||||||
|
כמו
|
||||||
|
אחר
|
||||||
|
אותו
|
||||||
|
בלי
|
||||||
|
לפני
|
||||||
|
אחרי
|
||||||
|
מאחורי
|
||||||
|
עלי
|
||||||
|
עליו
|
||||||
|
עליה
|
||||||
|
עליך
|
||||||
|
עלינו
|
||||||
|
עליכם
|
||||||
|
לעיכן
|
||||||
|
עליהם
|
||||||
|
עליהן
|
||||||
|
כל
|
||||||
|
כולם
|
||||||
|
כולן
|
||||||
|
כך
|
||||||
|
ככה
|
||||||
|
כזה
|
||||||
|
זה
|
||||||
|
זות
|
||||||
|
אותי
|
||||||
|
אותה
|
||||||
|
אותם
|
||||||
|
אותך
|
||||||
|
אותו
|
||||||
|
אותן
|
||||||
|
אותנו
|
||||||
|
ואת
|
||||||
|
את
|
||||||
|
אתכם
|
||||||
|
אתכן
|
||||||
|
איתי
|
||||||
|
איתו
|
||||||
|
איתך
|
||||||
|
איתה
|
||||||
|
איתם
|
||||||
|
איתן
|
||||||
|
איתנו
|
||||||
|
איתכם
|
||||||
|
איתכן
|
||||||
|
יהיה
|
||||||
|
תהיה
|
||||||
|
היתי
|
||||||
|
היתה
|
||||||
|
היה
|
||||||
|
להיות
|
||||||
|
עצמי
|
||||||
|
עצמו
|
||||||
|
עצמה
|
||||||
|
עצמם
|
||||||
|
עצמן
|
||||||
|
עצמנו
|
||||||
|
עצמהם
|
||||||
|
עצמהן
|
||||||
|
מי
|
||||||
|
מה
|
||||||
|
איפה
|
||||||
|
היכן
|
||||||
|
במקום שבו
|
||||||
|
אם
|
||||||
|
לאן
|
||||||
|
למקום שבו
|
||||||
|
מקום בו
|
||||||
|
איזה
|
||||||
|
מהיכן
|
||||||
|
איך
|
||||||
|
כיצד
|
||||||
|
באיזו מידה
|
||||||
|
מתי
|
||||||
|
בשעה ש
|
||||||
|
כאשר
|
||||||
|
כש
|
||||||
|
למרות
|
||||||
|
לפני
|
||||||
|
אחרי
|
||||||
|
מאיזו סיבה
|
||||||
|
הסיבה שבגללה
|
||||||
|
למה
|
||||||
|
מדוע
|
||||||
|
לאיזו תכלית
|
||||||
|
כי
|
||||||
|
יש
|
||||||
|
אין
|
||||||
|
אך
|
||||||
|
מנין
|
||||||
|
מאין
|
||||||
|
מאיפה
|
||||||
|
יכל
|
||||||
|
יכלה
|
||||||
|
יכלו
|
||||||
|
יכול
|
||||||
|
יכולה
|
||||||
|
יכולים
|
||||||
|
יכולות
|
||||||
|
יוכלו
|
||||||
|
יוכל
|
||||||
|
מסוגל
|
||||||
|
לא
|
||||||
|
רק
|
||||||
|
אולי
|
||||||
|
אין
|
||||||
|
לאו
|
||||||
|
אי
|
||||||
|
כלל
|
||||||
|
נגד
|
||||||
|
אם
|
||||||
|
עם
|
||||||
|
אל
|
||||||
|
אלה
|
||||||
|
אלו
|
||||||
|
אף
|
||||||
|
על
|
||||||
|
מעל
|
||||||
|
מתחת
|
||||||
|
מצד
|
||||||
|
בשביל
|
||||||
|
לבין
|
||||||
|
באמצע
|
||||||
|
בתוך
|
||||||
|
דרך
|
||||||
|
מבעד
|
||||||
|
באמצעות
|
||||||
|
למעלה
|
||||||
|
למטה
|
||||||
|
מחוץ
|
||||||
|
מן
|
||||||
|
לעבר
|
||||||
|
מכאן
|
||||||
|
כאן
|
||||||
|
הנה
|
||||||
|
הרי
|
||||||
|
פה
|
||||||
|
שם
|
||||||
|
אך
|
||||||
|
ברם
|
||||||
|
שוב
|
||||||
|
אבל
|
||||||
|
מבלי
|
||||||
|
בלי
|
||||||
|
מלבד
|
||||||
|
רק
|
||||||
|
בגלל
|
||||||
|
מכיוון
|
||||||
|
עד
|
||||||
|
אשר
|
||||||
|
ואילו
|
||||||
|
למרות
|
||||||
|
אס
|
||||||
|
כמו
|
||||||
|
כפי
|
||||||
|
אז
|
||||||
|
אחרי
|
||||||
|
כן
|
||||||
|
לכן
|
||||||
|
לפיכך
|
||||||
|
מאד
|
||||||
|
עז
|
||||||
|
מעט
|
||||||
|
מעטים
|
||||||
|
במידה
|
||||||
|
שוב
|
||||||
|
יותר
|
||||||
|
מדי
|
||||||
|
גם
|
||||||
|
כן
|
||||||
|
נו
|
||||||
|
אחר
|
||||||
|
אחרת
|
||||||
|
אחרים
|
||||||
|
אחרות
|
||||||
|
אשר
|
||||||
|
או
|
||||||
|
""".split())
|
|
@ -7,6 +7,8 @@ import ujson as json
|
||||||
from .en.lemmatizer import INDEX, EXC, RULES
|
from .en.lemmatizer import INDEX, EXC, RULES
|
||||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
||||||
from .symbols import VerbForm_inf, VerbForm_none
|
from .symbols import VerbForm_inf, VerbForm_none
|
||||||
|
from .symbols import Number_sing
|
||||||
|
from .symbols import Degree_pos
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer(object):
|
class Lemmatizer(object):
|
||||||
|
@ -45,11 +47,20 @@ class Lemmatizer(object):
|
||||||
morphology = {} if morphology is None else morphology
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
||||||
true_morph_key = morphology.get('morph', 0)
|
true_morph_key = morphology.get('morph', 0)
|
||||||
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
|
print(univ_pos, morphology)
|
||||||
|
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
|
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
|
||||||
return True
|
return True
|
||||||
elif true_morph_key in (VerbForm_inf, VerbForm_none):
|
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
||||||
|
return True
|
||||||
|
elif VerbForm_inf in morphology:
|
||||||
|
return True
|
||||||
|
elif VerbForm_none in morphology:
|
||||||
|
return True
|
||||||
|
elif Number_sing in morphology:
|
||||||
|
return True
|
||||||
|
elif Degree_pos in morphology:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -25,6 +25,8 @@ def _normalize_props(props):
|
||||||
if value in POS_IDS:
|
if value in POS_IDS:
|
||||||
value = POS_IDS[value]
|
value = POS_IDS[value]
|
||||||
out[key] = value
|
out[key] = value
|
||||||
|
elif isinstance(key, int):
|
||||||
|
out[key] = value
|
||||||
elif key.lower() == 'pos':
|
elif key.lower() == 'pos':
|
||||||
out[POS] = POS_IDS[value.upper()]
|
out[POS] = POS_IDS[value.upper()]
|
||||||
else:
|
else:
|
||||||
|
@ -32,12 +34,11 @@ def _normalize_props(props):
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = string_store
|
self.strings = string_store
|
||||||
self.tag_map = tag_map
|
self.tag_map = {}
|
||||||
self.lemmatizer = lemmatizer
|
self.lemmatizer = lemmatizer
|
||||||
self.n_tags = len(tag_map) + 1
|
self.n_tags = len(tag_map) + 1
|
||||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||||
|
@ -46,6 +47,7 @@ cdef class Morphology:
|
||||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
self.rich_tags[i].id = i
|
self.rich_tags[i].id = i
|
||||||
self.rich_tags[i].name = self.strings[tag_str]
|
self.rich_tags[i].name = self.strings[tag_str]
|
||||||
|
@ -74,11 +76,12 @@ cdef class Morphology:
|
||||||
# Related to Issue #220
|
# Related to Issue #220
|
||||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
tag_id = self.reverse_index[self.strings['SP']]
|
tag_id = self.reverse_index[self.strings['SP']]
|
||||||
|
rich_tag = self.rich_tags[tag_id]
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||||
if analysis is NULL:
|
if analysis is NULL:
|
||||||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||||
analysis.tag = self.rich_tags[tag_id]
|
|
||||||
tag_str = self.strings[self.rich_tags[tag_id].name]
|
tag_str = self.strings[self.rich_tags[tag_id].name]
|
||||||
|
analysis.tag = rich_tag
|
||||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
||||||
self.tag_map.get(tag_str, {}))
|
self.tag_map.get(tag_str, {}))
|
||||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
self._cache.set(tag_id, token.lex.orth, analysis)
|
||||||
|
@ -126,8 +129,7 @@ cdef class Morphology:
|
||||||
else:
|
else:
|
||||||
self.assign_feature(&cached.tag.morph, name_id, value_id)
|
self.assign_feature(&cached.tag.morph, name_id, value_id)
|
||||||
if cached.lemma == 0:
|
if cached.lemma == 0:
|
||||||
cached.lemma = self.lemmatize(rich_tag.pos, orth,
|
cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
|
||||||
self.tag_map.get(tag_str, {}))
|
|
||||||
self._cache.set(tag_id, orth, <void*>cached)
|
self._cache.set(tag_id, orth, <void*>cached)
|
||||||
|
|
||||||
def load_morph_exceptions(self, dict exc):
|
def load_morph_exceptions(self, dict exc):
|
||||||
|
|
|
@ -19,6 +19,9 @@ cdef class EntityRecognizer(Parser):
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
for action in self.moves.action_types:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
|
if 'actions' in self.cfg:
|
||||||
|
self.cfg['actions'].setdefault(action,
|
||||||
|
{}).setdefault(label, True)
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
for attr, freqs in self.vocab.serializer_freqs:
|
for attr, freqs in self.vocab.serializer_freqs:
|
||||||
|
@ -37,6 +40,9 @@ cdef class BeamEntityRecognizer(BeamParser):
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
for action in self.moves.action_types:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
|
if 'actions' in self.cfg:
|
||||||
|
self.cfg['actions'].setdefault(action,
|
||||||
|
{}).setdefault(label, True)
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
for attr, freqs in self.vocab.serializer_freqs:
|
for attr, freqs in self.vocab.serializer_freqs:
|
||||||
|
@ -54,6 +60,9 @@ cdef class DependencyParser(Parser):
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
for action in self.moves.action_types:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
|
if 'actions' in self.cfg:
|
||||||
|
self.cfg['actions'].setdefault(action,
|
||||||
|
{}).setdefault(label, True)
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
for attr, freqs in self.vocab.serializer_freqs:
|
for attr, freqs in self.vocab.serializer_freqs:
|
||||||
|
@ -71,6 +80,9 @@ cdef class BeamDependencyParser(BeamParser):
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
for action in self.moves.action_types:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
|
if 'actions' in self.cfg:
|
||||||
|
self.cfg['actions'].setdefault(action,
|
||||||
|
{}).setdefault(label, True)
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
for attr, freqs in self.vocab.serializer_freqs:
|
for attr, freqs in self.vocab.serializer_freqs:
|
||||||
|
|
|
@ -12,6 +12,8 @@ from ..sv import Swedish
|
||||||
from ..hu import Hungarian
|
from ..hu import Hungarian
|
||||||
from ..fi import Finnish
|
from ..fi import Finnish
|
||||||
from ..bn import Bengali
|
from ..bn import Bengali
|
||||||
|
from ..he import Hebrew
|
||||||
|
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..strings import StringStore
|
from ..strings import StringStore
|
||||||
from ..lemmatizer import Lemmatizer
|
from ..lemmatizer import Lemmatizer
|
||||||
|
@ -78,6 +80,11 @@ def bn_tokenizer():
|
||||||
return Bengali.Defaults.create_tokenizer()
|
return Bengali.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def he_tokenizer():
|
||||||
|
return Hebrew.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def stringstore():
|
def stringstore():
|
||||||
return StringStore()
|
return StringStore()
|
||||||
|
|
0
spacy/tests/he/__init__.py
Normal file
0
spacy/tests/he/__init__.py
Normal file
17
spacy/tests/he/test_tokenizer.py
Normal file
17
spacy/tests/he/test_tokenizer.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
ABBREVIATION_TESTS = [
|
||||||
|
('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])
|
||||||
|
]
|
||||||
|
|
||||||
|
TESTCASES = ABBREVIATION_TESTS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens):
    """Tokenize text and compare the non-space token texts with expected_tokens."""
    observed = []
    for token in he_tokenizer(text):
        if token.is_space:
            continue
        observed.append(token.text)
    assert expected_tokens == observed
|
|
@ -12,7 +12,7 @@ import pytest
|
||||||
def test_issue595():
|
def test_issue595():
|
||||||
"""Test lemmatization of base forms"""
|
"""Test lemmatization of base forms"""
|
||||||
words = ["Do", "n't", "feed", "the", "dog"]
|
words = ["Do", "n't", "feed", "the", "dog"]
|
||||||
tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}
|
tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
|
||||||
rules = {"verb": [["ed", "e"]]}
|
rules = {"verb": [["ed", "e"]]}
|
||||||
|
|
||||||
lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
|
lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
|
||||||
|
|
16
spacy/tests/regression/test_issue903.py
Normal file
16
spacy/tests/regression/test_issue903.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,tag,lemma',
    [("anus", "NN", "anus"),
     ("princess", "NN", "princess")])
def test_issue912(en_vocab, text, tag, lemma):
    """Test that base-form nouns ending in 's' are preserved as their lemma.

    Each case sets the tag `NN` on a single-word Doc and expects the lemma
    to equal the word itself.

    NOTE(review): this function lives in test_issue903.py but is named after
    issue 912 (the docstring was previously copied from that test as well).
    The name is kept so the test id stays stable; consider renaming it to
    test_issue903.
    """
    doc = Doc(en_vocab, words=[text])
    doc[0].tag_ = tag
    assert doc[0].lemma_ == lemma
|
||||||
|
|
113
spacy/tests/regression/test_issue910.py
Normal file
113
spacy/tests/regression/test_issue910.py
Normal file
|
@ -0,0 +1,113 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import contextlib
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
import pathlib
|
||||||
|
from ...gold import GoldParse
|
||||||
|
from ...pipeline import EntityRecognizer
|
||||||
|
from ...en import English
|
||||||
|
|
||||||
|
# Python 2/3 compatibility: when the `unicode` builtin does not exist
# (the lookup raises NameError), define it as an alias of `str`.
try:
    unicode
except NameError:
    unicode = str
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def train_data():
    """Return the NER training examples as a list of [text, entity offsets].

    Each entity offset is a [start, end, label] list; the start/end values
    are character positions into the text (end exclusive), e.g. [31, 36]
    covers "north" in the seventh example.
    """
    examples = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town",
         [[31, 36, "location"]]],
        ["show me chinese restaurants", [[8, 15, "cuisine"]]],
        ["show me chines restaurants", [[8, 14, "cuisine"]]],
        ["yes", []],
        ["yep", []],
        ["yeah", []],
        ["show me a mexican place in the centre",
         [[31, 37, "location"], [10, 17, "cuisine"]]],
        ["bye", []],
        ["goodbye", []],
        ["good bye", []],
        ["stop", []],
        ["end", []],
        ["i am looking for an indian spot", [[20, 26, "cuisine"]]],
        ["search for restaurants", []],
        ["anywhere in the west", [[16, 20, "location"]]],
        ["central indian restaurant",
         [[0, 7, "location"], [8, 14, "cuisine"]]],
        ["indeed", []],
        ["that's right", []],
        ["ok", []],
        ["great", []],
    ]
    return examples
|
||||||
|
|
||||||
|
@pytest.fixture
def additional_entity_types():
    """Return the entity labels used by the training examples."""
    new_labels = ['cuisine', 'location']
    return new_labels
|
||||||
|
|
||||||
|
|
||||||
|
@contextlib.contextmanager
def temp_save_model(model):
    """Save `model` into a fresh temporary directory and yield that directory.

    Writes `model.cfg` as JSON to `config.json` and calls `model.model.dump`
    with the POSIX path of a `model` file inside the directory. The directory
    is removed when the `with` block exits, including when the block (or the
    save step itself) raises.

    model: Object exposing `cfg` (JSON-serializable) and `model.dump(path)`.
    YIELDS (Path): The temporary directory holding the saved files.
    """
    model_dir = Path(tempfile.mkdtemp())
    try:
        # store the fine tuned model
        with (model_dir / "config.json").open('w') as file_:
            data = json.dumps(model.cfg)
            # json.dumps may hand back a byte string (Python 2); decode it so
            # that text is written to the file. `bytes` is `str` on Python 2,
            # so this is the same check as "not unicode" without needing the
            # module-level `unicode` alias.
            if isinstance(data, bytes):
                data = data.decode('utf8')
            file_.write(data)
        model.model.dump((model_dir / 'model').as_posix())
        yield model_dir
    finally:
        # Always clean up, even if the caller's block raised.
        shutil.rmtree(model_dir.as_posix())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _add_missing_labels(ner, entity_types):
    """Call `ner.add_label` for each type not yet in `ner.cfg['actions']['1']`."""
    for entity_type in entity_types:
        if entity_type not in ner.cfg['actions']['1']:
            ner.add_label(entity_type)


@pytest.mark.xfail
@pytest.mark.models
def test_issue910(train_data, additional_entity_types):
    '''Test that adding entities and resuming training works passably OK.
    There are two issues here:

    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.

    The test records the entities predicted before training, fine-tunes the
    entity recognizer on `train_data`, saves and reloads it, and asserts the
    reloaded model predicts the same entities for the same sentence.
    '''
    nlp = English()
    doc = nlp(u"I am looking for a restaurant in Berlin")
    ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
    # Fine tune the ner model
    _add_missing_labels(nlp.entity, additional_entity_types)

    nlp.entity.learn_rate = 0.001
    for _ in range(4):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            nlp.tagger(doc)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.entity.update(doc, gold)

    with temp_save_model(nlp.entity) as model_dir:
        # Load the fine tuned model
        loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)

    # Labels have to be added again on the loaded model (issue 1 above).
    _add_missing_labels(loaded_ner, additional_entity_types)

    doc = nlp(u"I am looking for a restaurant in Berlin", entity=False)
    nlp.tagger(doc)
    loaded_ner(doc)

    ents_after_train = [(ent.label_, ent.text) for ent in doc.ents]
    assert ents_before_train == ents_after_train
|
14
spacy/tests/regression/test_issue912.py
Normal file
14
spacy/tests/regression/test_issue912.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,tag,lemma', [("inner", "JJ", "inner")])
def test_issue912(en_vocab, text, tag, lemma):
    """Check that an adjective already in its base form keeps it as lemma.

    A single-word Doc is built from `text`, its only token is tagged with
    `tag`, and the resulting lemma must equal `lemma`.
    """
    doc = Doc(en_vocab, words=[text])
    token = doc[0]
    token.tag_ = tag
    assert token.lemma_ == lemma
|
||||||
|
|
|
@ -153,6 +153,16 @@ def check_renamed_kwargs(renamed, kwargs):
|
||||||
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
||||||
|
|
||||||
|
|
||||||
|
def is_windows():
    """Tell whether the interpreter reports a Windows platform.

    RETURNS (bool): True if `sys.platform` begins with 'win'.
    """
    platform_name = sys.platform
    return platform_name[:3] == 'win'
|
||||||
|
|
||||||
|
|
||||||
|
def is_python2():
    """Check if Python 2 is used.

    RETURNS (bool): True if the running interpreter's major version is 2.
    """
    # sys.version is a display string; sys.version_info is the structured
    # value meant for version comparisons.
    return sys.version_info[0] == 2
|
||||||
|
|
||||||
|
|
||||||
def parse_package_meta(package_path, package, require=True):
|
def parse_package_meta(package_path, package, require=True):
|
||||||
location = os.path.join(str(package_path), package, 'meta.json')
|
location = os.path.join(str(package_path), package, 'meta.json')
|
||||||
if os.path.isfile(location):
|
if os.path.isfile(location):
|
||||||
|
|
|
@ -82,7 +82,7 @@ p
|
||||||
| conjunction features out of the atomic predictors. Let's say you have
|
| conjunction features out of the atomic predictors. Let's say you have
|
||||||
| two atomic predictors asking, "What is the part-of-speech of the
|
| two atomic predictors asking, "What is the part-of-speech of the
|
||||||
| previous token?", and "What is the part-of-speech of the previous
|
| previous token?", and "What is the part-of-speech of the previous
|
||||||
| previous token?". These ppredictors will introduce a number of features,
|
| previous token?". These predictors will introduce a number of features,
|
||||||
| e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
|
| e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
|
||||||
| template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
|
| template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user