Merge remote-tracking branch 'upstream/develop' into indonesian

This commit is contained in:
Jim Geovedi 2017-08-25 09:21:49 +08:00
commit 58d8078971
25 changed files with 340 additions and 85 deletions

View File

@ -229,7 +229,7 @@ Compile from source
The other way to install spaCy is to clone its The other way to install spaCy is to clone its
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from `GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
source. That is the common way if you want to make changes to the code base. source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development enviroment consisting of a You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler, Python distribution including header files, a compiler,
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_ `pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest. and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

View File

@ -3,15 +3,23 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!! # NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals #from __future__ import unicode_literals
if __name__ == '__main__': if __name__ == '__main__':
import plac import plac
import sys import sys
from spacy.cli import download, link, info, package, train, convert from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile
from spacy.util import prints from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train, commands = {
'convert': convert, 'package': package} 'download': download,
'link': link,
'info': info,
'train': train,
'convert': convert,
'package': package,
'model': model,
'profile': profile,
}
if len(sys.argv) == 1: if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1) prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1) command = sys.argv.pop(1)
@ -19,5 +27,7 @@ if __name__ == '__main__':
if command in commands: if command in commands:
plac.call(commands[command]) plac.call(commands[command])
else: else:
prints("Available: %s" % ', '.join(commands), prints(
title="Unknown command: %s" % command, exits=1) "Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)

View File

@ -218,7 +218,10 @@ def drop_layer(layer, factor=2.):
return layer.begin_update(X, drop=drop) return layer.begin_update(X, drop=drop)
else: else:
return X, lambda dX, sgd=None: dX return X, lambda dX, sgd=None: dX
return wrap(drop_layer_fwd, layer)
model = wrap(drop_layer_fwd, layer)
model.predict = layer
return model
def Tok2Vec(width, embed_size, preprocess=None): def Tok2Vec(width, embed_size, preprocess=None):
@ -359,8 +362,6 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
def backward(d_output, sgd=None): def backward(d_output, sgd=None):
return (tokens, d_output) return (tokens, d_output)
return vectors, backward return vectors, backward
def fine_tune(embedding, combine=None): def fine_tune(embedding, combine=None):
if combine is not None: if combine is not None:
raise NotImplementedError( raise NotImplementedError(
@ -373,22 +374,30 @@ def fine_tune(embedding, combine=None):
flat_tokvecs = embedding.ops.flatten(tokvecs) flat_tokvecs = embedding.ops.flatten(tokvecs)
flat_vecs = embedding.ops.flatten(vecs) flat_vecs = embedding.ops.flatten(vecs)
output = embedding.ops.unflatten( output = embedding.ops.unflatten(
(model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs), (model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
lengths)
def fine_tune_bwd(d_output, sgd=None): def fine_tune_bwd(d_output, sgd=None):
bp_vecs(d_output, sgd=sgd)
flat_grad = model.ops.flatten(d_output) flat_grad = model.ops.flatten(d_output)
model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
if sgd is not None: if sgd is not None:
sgd(model._mem.weights, model._mem.gradient, key=model.id) sgd(model._mem.weights, model._mem.gradient, key=model.id)
return d_output return [d_o * model.mix[0] for d_o in d_output]
return output, fine_tune_bwd return output, fine_tune_bwd
def fine_tune_predict(docs_tokvecs):
docs, tokvecs = docs_tokvecs
vecs = embedding(docs)
return [model.mix[0]*tv+model.mix[1]*v
for tv, v in zip(tokvecs, vecs)]
model = wrap(fine_tune_fwd, embedding) model = wrap(fine_tune_fwd, embedding)
model.mix = model._mem.add((model.id, 'mix'), (2,)) model.mix = model._mem.add((model.id, 'mix'), (2,))
model.mix.fill(1.) model.mix.fill(0.5)
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
model.predict = fine_tune_predict
return model return model

View File

@ -2,5 +2,7 @@ from .download import download
from .info import info from .info import info
from .link import link from .link import link
from .package import package from .package import package
from .profile import profile
from .train import train from .train import train
from .convert import convert from .convert import convert
from .model import model

View File

@ -24,28 +24,29 @@ def download(cmd, model, direct=False):
with version. with version.
""" """
if direct: if direct:
download_model('{m}/{m}.tar.gz'.format(m=model)) dl = download_model('{m}/{m}.tar.gz'.format(m=model))
else: else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts") shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model) model_name = shortcuts.get(model, model)
compatibility = get_compatibility() compatibility = get_compatibility()
version = get_version(model_name, compatibility) version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
try: if dl == 0:
# Get package path here because link uses try:
# pip.get_installed_distributions() to check if model is a package, # Get package path here because link uses
# which fails if model was just installed via subprocess # pip.get_installed_distributions() to check if model is a package,
package_path = get_package_path(model_name) # which fails if model was just installed via subprocess
link(None, model_name, model, force=True, model_path=package_path) package_path = get_package_path(model_name)
except: link(None, model_name, model, force=True, model_path=package_path)
# Dirty, but since spacy.download and the auto-linking is mostly except:
# a convenience wrapper, it's best to show a success message and # Dirty, but since spacy.download and the auto-linking is mostly
# loading instructions, even if linking fails. # a convenience wrapper, it's best to show a success message and
prints("Creating a shortcut link for 'en' didn't work (maybe you " # loading instructions, even if linking fails.
"don't have admin permissions?), but you can still load " prints("Creating a shortcut link for 'en' didn't work (maybe you "
"the model via its full package name:", "don't have admin permissions?), but you can still load "
"nlp = spacy.load('%s')" % model_name, "the model via its full package name:",
title="Download successful") "nlp = spacy.load('%s')" % model_name,
title="Download successful")
def get_json(url, desc): def get_json(url, desc):
@ -77,6 +78,6 @@ def get_version(model, comp):
def download_model(filename): def download_model(filename):
download_url = about.__download_url__ + '/' + filename download_url = about.__download_url__ + '/' + filename
subprocess.call([sys.executable, '-m', return subprocess.call([sys.executable, '-m',
'pip', 'install', '--no-cache-dir', download_url], 'pip', 'install', '--no-cache-dir', download_url],
env=os.environ.copy()) env=os.environ.copy())

119
spacy/cli/model.py Normal file
View File

@ -0,0 +1,119 @@
# coding: utf8
from __future__ import unicode_literals
import gzip
import math
from ast import literal_eval
from pathlib import Path
from preshed.counter import PreshCounter
import spacy
from ..compat import fix_text
from .. import util
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
    """Build a blank pipeline for `lang`, fill its vocab and save it.

    cmd: Not used by this function.
    lang: Language code passed to `spacy.blank`.
    model_dir: Output directory for the saved model.
    freqs_data: Location of the word frequencies file (required).
    clusters_data: Location of the Brown clusters file; falsy to skip.
    vectors_data: Location of the word vectors file; falsy to skip. The
        path is only checked for existence here, nothing is loaded from it.
    """
    output_dir = Path(model_dir)
    freqs_loc = Path(freqs_data)
    clusters_loc = None
    if clusters_data:
        clusters_loc = Path(clusters_data)
    vectors_loc = None
    if vectors_data:
        vectors_loc = Path(vectors_data)
    # Exits early if any of the given locations is not a file.
    check_dirs(freqs_loc, clusters_loc, vectors_loc)

    nlp = spacy.blank(lang)
    probs, oov_prob = read_probs(freqs_loc)
    clusters = {}
    if clusters_loc:
        clusters = read_clusters(clusters_loc)
    populate_vocab(nlp.vocab, clusters, probs, oov_prob)
    create_model(output_dir, nlp)
def create_model(model_path, model):
    """Serialize `model` into the directory `model_path`.

    model_path (Path): Target directory. It is created, together with any
        missing parent directories, if it does not exist yet.
    model: Object exposing `to_disk(path)`; it receives the POSIX-style
        string form of `model_path`.
    """
    if not model_path.exists():
        # parents=True so a nested output location does not fail just
        # because an intermediate directory is missing.
        model_path.mkdir(parents=True)
    model.to_disk(model_path.as_posix())
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    """Read a frequencies file and turn the counts into log probabilities.

    Each line is expected to hold three tab-separated columns:
    `freq`, `doc_freq` and `key`, where `key` is a Python literal that
    `ast.literal_eval` can parse into the word.

    freqs_path (Path): Location of the (optionally gzipped) frequencies file.
    max_length (int): Entries whose raw `key` column is this long or longer
        are dropped.
    min_doc_freq (int): Minimum document frequency for an entry to be kept.
    min_freq (int): Minimum frequency for an entry to be kept.
    RETURNS (tuple): `(probs, oov_prob)`, where `probs` maps each kept word
        to `log(smoothed count) - log(total)` and `oov_prob` is the same
        quantity for a count of 0.
    """
    counts = PreshCounter()
    total = 0
    # First pass: collect every frequency so the counter can be smoothed
    # and the total computed. The handle is closed as soon as we are done.
    with check_unzip(freqs_path) as freqs_file:
        for i, line in enumerate(freqs_file):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    # Second pass: re-open the file and keep only the entries that pass the
    # frequency and length thresholds.
    with check_unzip(freqs_path) as freqs_file:
        for line in freqs_file:
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if (doc_freq >= min_doc_freq and freq >= min_freq
                    and len(key) < max_length):
                word = literal_eval(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def read_clusters(clusters_path):
    """Load a clusters file into a dict mapping word -> cluster string.

    Every line is split on whitespace into `cluster, word, freq`; lines that
    do not unpack into exactly three fields are skipped. Words with a
    frequency below 3 are assigned the cluster '0'. Afterwards the lower-,
    title- and upper-cased variants of every word are added, unless that
    variant already has an entry.

    clusters_path (Path): Location of the clusters file.
    RETURNS (dict): The word -> cluster table.
    """
    table = {}
    with clusters_path.open() as handle:
        for row in handle:
            try:
                bits, token, count = row.split()
                token = fix_text(token)
            except ValueError:
                # Malformed row (or fix_text raised ValueError): ignore it.
                continue
            # A word the clusterer has seen only a few times has an
            # unreliable cluster, so it falls back to '0'.
            table[token] = bits if int(count) >= 3 else '0'
    # Expand the table with re-cased variants. Iterate over a snapshot
    # because the table grows while we go.
    for token, bits in list(table.items()):
        for variant in (token.lower(), token.title(), token.upper()):
            table.setdefault(variant, bits)
    return table
def populate_vocab(vocab, clusters, probs, oov_prob):
    """Write probabilities and clusters onto the lexemes of `vocab`.

    Words are visited from highest to lowest probability. For each one the
    entry `vocab[word]` gets its `prob` set, `is_oov` set to False and
    `cluster` set from `clusters` (0 when the word has no cluster).

    vocab: Indexable by word; must return an object with settable `prob`,
        `is_oov` and `cluster` attributes.
    clusters (dict): word -> cluster given as a string of binary digits.
    probs (dict): word -> log probability.
    oov_prob: Not used by this function.
    """
    ascending = sorted(list(probs.items()), key=lambda entry: entry[1])
    # Reverse the ascending order (rather than sorting descending) so that
    # equal probabilities keep exactly the same relative order as before.
    for word, prob in ascending[::-1]:
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = int(clusters[word][::-1], 2) if word in clusters else 0
def check_unzip(file_path):
    """Open `file_path` for reading, transparently handling gzip files.

    file_path (Path): Location of the file. A path whose POSIX string ends
        in 'gz' is opened through `gzip`; anything else via `Path.open()`.
    RETURNS: An open file object. The caller is responsible for closing it
        (it can be used as a context manager).
    """
    import sys

    file_path_str = file_path.as_posix()
    if file_path_str.endswith('gz'):
        if sys.version_info[0] >= 3:
            # On Python 3 the default gzip mode yields bytes, while
            # Path.open() yields text. Callers split lines on a unicode
            # tab, so open the archive in text mode to keep both branches
            # consistent.
            return gzip.open(file_path_str, 'rt')
        return gzip.open(file_path_str)
    else:
        return file_path.open()
def check_dirs(freqs_data, clusters_data, vectors_data):
    """Verify that the input locations point at existing files.

    freqs_data (Path): Frequencies file; always checked.
    clusters_data (Path): Brown clusters file; checked only when truthy.
    vectors_data (Path): Word vectors file; checked only when truthy.

    For every failed check `util.sys_exit` is called with the offending
    path and a title describing which input is missing.
    """
    if not freqs_data.is_file():
        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
    optional_inputs = (
        (clusters_data, "No Brown clusters file found"),
        (vectors_data, "No word vectors file found"),
    )
    for location, title in optional_inputs:
        if location and not location.is_file():
            util.sys_exit(location.as_posix(), title=title)

45
spacy/cli/profile.py Normal file
View File

@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
from pathlib import Path
import ujson
import cProfile
import pstats
import spacy
import sys
import tqdm
import cytoolz
def read_inputs(loc):
    """Yield the 'text' field of every JSON line read from `loc`.

    loc: Location of a file with one JSON object per line, or None to read
        the lines from standard input instead (each stdin line is encoded
        to UTF-8 before being parsed).
    YIELDS: The value stored under the 'text' key of each parsed line.
    """
    if loc is None:
        for line in sys.stdin:
            yield ujson.loads(line.encode('utf8'))['text']
    else:
        # The context manager closes the file once the generator is
        # exhausted or closed, instead of leaving the handle open.
        with Path(loc).open() as file_:
            for line in file_:
                yield ujson.loads(line)['text']
@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.

    cmd: Not used by this function.
    lang: Model or language name passed to `spacy.load`.
    inputs: Iterable of texts. When omitted, texts are read from standard
        input via `read_inputs(None)`.

    At most 10000 texts are consumed. The profile is written to
    "Profile.prof" in the current working directory and then printed,
    sorted by time.
    """
    if inputs is None:
        # The default bypasses the `read_inputs` converter in the
        # annotation, so fall back to stdin explicitly rather than handing
        # None to cytoolz.take.
        inputs = read_inputs(None)
    nlp = spacy.load(lang)
    texts = list(cytoolz.take(10000, inputs))
    # runctx evaluates the statement with these namespaces, so `nlp` and
    # `texts` are picked up from locals() and `parse_texts` from globals().
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
    """Stream `texts` through `nlp.pipe` and discard the results.

    nlp: Object exposing `pipe(texts, batch_size=...)`.
    texts: Iterable of texts; wrapped in a tqdm progress bar.
    """
    with_progress = tqdm.tqdm(texts)
    stream = nlp.pipe(with_progress, batch_size=128)
    # Only the work of processing matters here, so just drain the stream.
    for _ in stream:
        pass

View File

@ -32,10 +32,12 @@ from ..compat import json_dumps
resume=("Whether to resume training", "flag", "R", bool), resume=("Whether to resume training", "flag", "R", bool),
no_tagger=("Don't train tagger", "flag", "T", bool), no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool), no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool) no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
) )
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False): use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False):
""" """
Train a model. Expects data in spaCy's JSON format. Train a model. Expects data in spaCy's JSON format.
""" """
@ -86,13 +88,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
i += 20 i += 20
with tqdm.tqdm(total=n_train_words, leave=False) as pbar: with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True, train_docs = corpus.train_docs(nlp, projectivize=True,
gold_preproc=False, max_length=0) gold_preproc=gold_preproc, max_length=0)
losses = {} losses = {}
for batch in minibatch(train_docs, size=batch_sizes): for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch) docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses, drop=next(dropout_rates), losses=losses,
update_tensors=True) update_shared=True)
pbar.update(sum(len(doc) for doc in docs)) pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
@ -104,7 +106,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
scorer = nlp_loaded.evaluate( scorer = nlp_loaded.evaluate(
corpus.dev_docs( corpus.dev_docs(
nlp_loaded, nlp_loaded,
gold_preproc=False)) gold_preproc=gold_preproc))
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_: with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores)) file_.write(json_dumps(scorer.scores))

View File

@ -60,7 +60,7 @@ GLOSSARY = {
'JJR': 'adjective, comparative', 'JJR': 'adjective, comparative',
'JJS': 'adjective, superlative', 'JJS': 'adjective, superlative',
'LS': 'list item marker', 'LS': 'list item marker',
'MD': 'verb, modal auxillary', 'MD': 'verb, modal auxiliary',
'NIL': 'missing tag', 'NIL': 'missing tag',
'NN': 'noun, singular or mass', 'NN': 'noun, singular or mass',
'NNP': 'noun, proper singular', 'NNP': 'noun, proper singular',
@ -91,7 +91,7 @@ GLOSSARY = {
'NFP': 'superfluous punctuation', 'NFP': 'superfluous punctuation',
'GW': 'additional word in multi-word expression', 'GW': 'additional word in multi-word expression',
'XX': 'unknown', 'XX': 'unknown',
'BES': 'auxillary "be"', 'BES': 'auxiliary "be"',
'HVS': 'forms of "have"', 'HVS': 'forms of "have"',

View File

@ -406,11 +406,11 @@ cdef class GoldParse:
if tags is None: if tags is None:
tags = [None for _ in doc] tags = [None for _ in doc]
if heads is None: if heads is None:
heads = [token.i for token in doc] heads = [None for token in doc]
if deps is None: if deps is None:
deps = [None for _ in doc] deps = [None for _ in doc]
if entities is None: if entities is None:
entities = ['-' for _ in doc] entities = [None for _ in doc]
elif len(entities) == 0: elif len(entities) == 0:
entities = ['O' for _ in doc] entities = ['O' for _ in doc]
elif not isinstance(entities[0], basestring): elif not isinstance(entities[0], basestring):

View File

@ -232,7 +232,10 @@ for verb_data in [
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2}, {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"}, {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be", NORM: "was"}, {ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}]: {ORTH: "were", LEMMA: "be", NORM: "were"},
{ORTH: "have", NORM: "have"},
{ORTH: "has", LEMMA: "have", NORM: "has"},
{ORTH: "dare", NORM: "dare"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:

View File

@ -200,6 +200,7 @@ class Language(object):
else: else:
flat_list.append(pipe) flat_list.append(pipe)
self.pipeline = flat_list self.pipeline = flat_list
self._optimizer = None
@property @property
def meta(self): def meta(self):
@ -244,7 +245,7 @@ class Language(object):
def matcher(self): def matcher(self):
return self.get_component('matcher') return self.get_component('matcher')
def get_component(self, name): def get_component(self, name):
if self.pipeline in (True, None): if self.pipeline in (True, None):
return None return None
for proc in self.pipeline: for proc in self.pipeline:
@ -278,7 +279,7 @@ class Language(object):
return self.tokenizer(text) return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None, def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_tensors=False): update_shared=False):
"""Update the models in the pipeline. """Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects. docs (iterable): A batch of `Doc` objects.
@ -298,6 +299,10 @@ class Language(object):
"Got: %d, %d" % (len(docs), len(golds))) "Got: %d, %d" % (len(docs), len(golds)))
if len(docs) == 0: if len(docs) == 0:
return return
if sgd is None:
if self._optimizer is None:
self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer
tok2vec = self.pipeline[0] tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs) feats = tok2vec.doc2feats(docs)
grads = {} grads = {}
@ -312,10 +317,11 @@ class Language(object):
continue continue
d_tokvecses = proc.update((docs, tokvecses), golds, d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses) drop=drop, sgd=get_grads, losses=losses)
if update_tensors and d_tokvecses is not None: if update_shared and d_tokvecses is not None:
for i, d_tv in enumerate(d_tokvecses): for i, d_tv in enumerate(d_tokvecses):
all_d_tokvecses[i] += d_tv all_d_tokvecses[i] += d_tv
bp_tokvecses(all_d_tokvecses, sgd=sgd) if update_shared and bp_tokvecses is not None:
bp_tokvecses(all_d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items(): for key, (W, dW) in grads.items():
sgd(W, dW, key=key) sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory. # Clear the tensor variable, to free GPU memory.
@ -378,11 +384,11 @@ class Language(object):
eps = util.env_opt('optimizer_eps', 1e-08) eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6) L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.) max_grad_norm = util.env_opt('grad_norm_clip', 1.)
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps) beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm self._optimizer.max_grad_norm = max_grad_norm
optimizer.device = device self._optimizer.device = device
return optimizer return self._optimizer
def evaluate(self, docs_golds): def evaluate(self, docs_golds):
scorer = Scorer() scorer = Scorer()

View File

@ -294,6 +294,8 @@ class NeuralTagger(BaseThincComponent):
doc.is_tagged = True doc.is_tagged = True
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs docs, tokvecs = docs_tokvecs
if self.model.nI is None: if self.model.nI is None:
@ -302,6 +304,8 @@ class NeuralTagger(BaseThincComponent):
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
if losses is not None:
losses[self.name] += loss
return d_tokvecs return d_tokvecs
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):

View File

@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
def has_gold(self, GoldParse gold, start=0, end=None): def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.ner) end = end or len(gold.ner)
if all([tag == '-' for tag in gold.ner[start:end]]): if all([tag in ('-', None) for tag in gold.ner[start:end]]):
return False return False
else: else:
return True return True

View File

@ -14,4 +14,8 @@ cdef class Parser:
cdef readonly TransitionSystem moves cdef readonly TransitionSystem moves
cdef readonly object cfg cdef readonly object cfg
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil

View File

@ -257,10 +257,15 @@ cdef class Parser:
nI=token_vector_width) nI=token_vector_width)
with Model.use_device('cpu'): with Model.use_device('cpu'):
upper = chain( if depth == 0:
clone(Maxout(hidden_width), (depth-1)), upper = chain()
zero_init(Affine(nr_class, drop_factor=0.0)) upper.is_noop = True
) else:
upper = chain(
clone(Maxout(hidden_width), (depth-1)),
zero_init(Affine(nr_class, drop_factor=0.0))
)
upper.is_noop = False
# TODO: This is an unfortunate hack atm! # TODO: This is an unfortunate hack atm!
# Used to set input dimensions in network. # Used to set input dimensions in network.
lower.begin_training(lower.ops.allocate((500, token_vector_width))) lower.begin_training(lower.ops.allocate((500, token_vector_width)))
@ -412,20 +417,27 @@ cdef class Parser:
cdef np.ndarray scores cdef np.ndarray scores
c_token_ids = <int*>token_ids.data c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data c_is_valid = <int*>is_valid.data
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
while not next_step.empty(): while not next_step.empty():
for i in range(next_step.size()): if not has_hidden:
st = next_step[i] for i in cython.parallel.prange(
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) next_step.size(), num_threads=6, nogil=True):
self.moves.set_valid(&c_is_valid[i*nr_class], st) self._parse_step(next_step[i],
vectors = state2vec(token_ids[:next_step.size()]) feat_weights, nr_class, nr_feat, nr_piece)
scores = vec2scores(vectors) else:
c_scores = <float*>scores.data for i in range(next_step.size()):
for i in range(next_step.size()): st = next_step[i]
st = next_step[i] st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
guess = arg_max_if_valid( self.moves.set_valid(&c_is_valid[i*nr_class], st)
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) vectors = state2vec(token_ids[:next_step.size()])
action = self.moves.c[guess] scores = vec2scores(vectors)
action.do(st, action.label) c_scores = <float*>scores.data
for i in range(next_step.size()):
st = next_step[i]
guess = arg_max_if_valid(
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
action = self.moves.c[guess]
action.do(st, action.label)
this_step, next_step = next_step, this_step this_step, next_step = next_step, this_step
next_step.clear() next_step.clear()
for st in this_step: for st in this_step:
@ -482,7 +494,31 @@ cdef class Parser:
beams.append(beam) beams.append(beam)
return beams return beams
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil:
'''This only works with no hidden layers -- fast but inaccurate'''
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
token_ids = <int*>calloc(nr_feat, sizeof(int))
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
is_valid = <int*>calloc(nr_class, sizeof(int))
state.set_context_tokens(token_ids, nr_feat)
sum_state_features(scores,
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
self.moves.set_valid(is_valid, state)
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
action = self.moves.c[guess]
action.do(state, action.label)
free(is_valid)
free(scores)
free(token_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
return self.update_beam(docs_tokvecs, golds, return self.update_beam(docs_tokvecs, golds,
self.cfg['beam_width'], self.cfg['beam_density'], self.cfg['beam_width'], self.cfg['beam_density'],
@ -555,6 +591,10 @@ cdef class Parser:
def update_beam(self, docs_tokvecs, golds, width=None, density=None, def update_beam(self, docs_tokvecs, golds, width=None, density=None,
drop=0., sgd=None, losses=None): drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if not golds:
return None
if width is None: if width is None:
width = self.cfg.get('beam_width', 2) width = self.cfg.get('beam_width', 2)
if density is None: if density is None:

View File

@ -303,8 +303,14 @@ cdef class Doc:
return self.user_hooks['vector'](self) return self.user_hooks['vector'](self)
if self._vector is not None: if self._vector is not None:
return self._vector return self._vector
elif self.has_vector and len(self): elif not len(self):
self._vector = sum(t.vector for t in self) / len(self) self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
return self._vector
elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
for token in self.c[:self.length]:
vector += self.vocab.get_vector(token.lex.orth)
self._vector = vector / len(self)
return self._vector return self._vector
elif self.tensor is not None: elif self.tensor is not None:
self._vector = self.tensor.mean(axis=0) self._vector = self.tensor.mean(axis=0)

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import bz2 import bz2
import ujson import ujson
import re import re
import numpy
from libc.string cimport memset, memcpy from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t from libc.stdint cimport int32_t
@ -244,7 +245,7 @@ cdef class Vocab:
@property @property
def vectors_length(self): def vectors_length(self):
return len(self.vectors) return self.vectors.data.shape[1]
def clear_vectors(self, new_dim=None): def clear_vectors(self, new_dim=None):
"""Drop the current vector table. Because all vectors must be the same """Drop the current vector table. Because all vectors must be the same
@ -268,7 +269,10 @@ cdef class Vocab:
""" """
if isinstance(orth, basestring_): if isinstance(orth, basestring_):
orth = self.strings.add(orth) orth = self.strings.add(orth)
return self.vectors[orth] if orth in self.vectors.key2row:
return self.vectors[orth]
else:
return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector): def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary. """Set a vector for a word in the vocabulary.

View File

@ -21,7 +21,7 @@ p
+pos-row("$", "SYM", "SymType=currency", "symbol, currency") +pos-row("$", "SYM", "SymType=currency", "symbol, currency")
+pos-row("ADD", "X", "", "email") +pos-row("ADD", "X", "", "email")
+pos-row("AFX", "ADJ", "Hyph=yes", "affix") +pos-row("AFX", "ADJ", "Hyph=yes", "affix")
+pos-row("BES", "VERB", "", 'auxillary "be"') +pos-row("BES", "VERB", "", 'auxiliary "be"')
+pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating") +pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
+pos-row("CD", "NUM", "NumType=card", "cardinal number") +pos-row("CD", "NUM", "NumType=card", "cardinal number")
+pos-row("DT", "DET", "determiner") +pos-row("DT", "DET", "determiner")
@ -35,7 +35,7 @@ p
+pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative") +pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
+pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative") +pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
+pos-row("LS", "PUNCT", "NumType=ord", "list item marker") +pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary") +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary")
+pos-row("NFP", "PUNCT", "", "superfluous punctuation") +pos-row("NFP", "PUNCT", "", "superfluous punctuation")
+pos-row("NIL", "", "", "missing tag") +pos-row("NIL", "", "", "missing tag")
+pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass") +pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")

View File

@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
p p
| The token within the span that's highest in the parse tree. If there's a | The token within the span that's highest in the parse tree. If there's a
| tie, the earlist is prefered. | tie, the earliest is preferred.
+aside-code("Example"). +aside-code("Example").
doc = nlp(u'I like New York in Autumn.') doc = nlp(u'I like New York in Autumn.')

View File

@ -39,7 +39,7 @@ p
+h(2, "special-cases") Adding special case tokenization rules +h(2, "special-cases") Adding special case tokenization rules
p p
| Most domains have at least some idiosyncracies that require custom | Most domains have at least some idiosyncrasies that require custom
| tokenization rules. This could be very certain expressions, or | tokenization rules. This could be very certain expressions, or
| abbreviations only used in this specific field. | abbreviations only used in this specific field.

View File

@ -109,7 +109,7 @@ p
| The other way to install spaCy is to clone its | The other way to install spaCy is to clone its
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
| the common way if you want to make changes to the code base. You'll need to | the common way if you want to make changes to the code base. You'll need to
| make sure that you have a development enviroment consisting of a Python | make sure that you have a development environment consisting of a Python
| distribution including header files, a compiler, | distribution including header files, a compiler,
| #[+a("https://pip.pypa.io/en/latest/installing/") pip], | #[+a("https://pip.pypa.io/en/latest/installing/") pip],
| #[+a("https://virtualenv.pypa.io/") virtualenv] and | #[+a("https://virtualenv.pypa.io/") virtualenv] and

View File

@ -190,10 +190,10 @@ p
+code("Examples", "bash"). +code("Examples", "bash").
# set up shortcut link to load installed package as "en_default" # set up shortcut link to load installed package as "en_default"
python -m spacy link en_core_web_md en_default spacy link en_core_web_md en_default
# set up shortcut link to load local model as "my_amazing_model" # set up shortcut link to load local model as "my_amazing_model"
python -m spacy link /Users/you/model my_amazing_model spacy link /Users/you/model my_amazing_model
+infobox("Important note") +infobox("Important note")
| In order to create a symlink, your user needs the #[strong required permissions]. | In order to create a symlink, your user needs the #[strong required permissions].

View File

@ -40,7 +40,7 @@ p
+cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres] +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
+row +row
+cell I read the paper yesteday +cell I read the paper yesterday
+cell read +cell read
+cell read +cell read
+cell verb +cell verb

View File

@ -94,7 +94,7 @@ p
| is mostly intended as a convenient, interactive wrapper. It performs | is mostly intended as a convenient, interactive wrapper. It performs
| compatibility checks and prints detailed error messages and warnings. | compatibility checks and prints detailed error messages and warnings.
| However, if you're downloading models as part of an automated build | However, if you're downloading models as part of an automated build
| process, this only adds an unecessary layer of complexity. If you know | process, this only adds an unnecessary layer of complexity. If you know
| which models your application needs, you should be specifying them directly. | which models your application needs, you should be specifying them directly.
p p