mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Merge remote-tracking branch 'upstream/develop' into indonesian
This commit is contained in:
commit
58d8078971
|
@ -229,7 +229,7 @@ Compile from source
|
||||||
The other way to install spaCy is to clone its
|
The other way to install spaCy is to clone its
|
||||||
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
|
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
|
||||||
source. That is the common way if you want to make changes to the code base.
|
source. That is the common way if you want to make changes to the code base.
|
||||||
You'll need to make sure that you have a development enviroment consisting of a
|
You'll need to make sure that you have a development environment consisting of a
|
||||||
Python distribution including header files, a compiler,
|
Python distribution including header files, a compiler,
|
||||||
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
|
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
|
||||||
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.
|
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.
|
||||||
|
|
|
@ -3,15 +3,23 @@ from __future__ import print_function
|
||||||
# NB! This breaks in plac on Python 2!!
|
# NB! This breaks in plac on Python 2!!
|
||||||
#from __future__ import unicode_literals
|
#from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import plac
|
import plac
|
||||||
import sys
|
import sys
|
||||||
from spacy.cli import download, link, info, package, train, convert
|
from spacy.cli import download, link, info, package, train, convert, model
|
||||||
|
from spacy.cli import profile
|
||||||
from spacy.util import prints
|
from spacy.util import prints
|
||||||
|
|
||||||
commands = {'download': download, 'link': link, 'info': info, 'train': train,
|
commands = {
|
||||||
'convert': convert, 'package': package}
|
'download': download,
|
||||||
|
'link': link,
|
||||||
|
'info': info,
|
||||||
|
'train': train,
|
||||||
|
'convert': convert,
|
||||||
|
'package': package,
|
||||||
|
'model': model,
|
||||||
|
'profile': profile,
|
||||||
|
}
|
||||||
if len(sys.argv) == 1:
|
if len(sys.argv) == 1:
|
||||||
prints(', '.join(commands), title="Available commands", exits=1)
|
prints(', '.join(commands), title="Available commands", exits=1)
|
||||||
command = sys.argv.pop(1)
|
command = sys.argv.pop(1)
|
||||||
|
@ -19,5 +27,7 @@ if __name__ == '__main__':
|
||||||
if command in commands:
|
if command in commands:
|
||||||
plac.call(commands[command])
|
plac.call(commands[command])
|
||||||
else:
|
else:
|
||||||
prints("Available: %s" % ', '.join(commands),
|
prints(
|
||||||
title="Unknown command: %s" % command, exits=1)
|
"Available: %s" % ', '.join(commands),
|
||||||
|
title="Unknown command: %s" % command,
|
||||||
|
exits=1)
|
||||||
|
|
29
spacy/_ml.py
29
spacy/_ml.py
|
@ -218,7 +218,10 @@ def drop_layer(layer, factor=2.):
|
||||||
return layer.begin_update(X, drop=drop)
|
return layer.begin_update(X, drop=drop)
|
||||||
else:
|
else:
|
||||||
return X, lambda dX, sgd=None: dX
|
return X, lambda dX, sgd=None: dX
|
||||||
return wrap(drop_layer_fwd, layer)
|
|
||||||
|
model = wrap(drop_layer_fwd, layer)
|
||||||
|
model.predict = layer
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, preprocess=None):
|
def Tok2Vec(width, embed_size, preprocess=None):
|
||||||
|
@ -359,8 +362,6 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
||||||
def backward(d_output, sgd=None):
|
def backward(d_output, sgd=None):
|
||||||
return (tokens, d_output)
|
return (tokens, d_output)
|
||||||
return vectors, backward
|
return vectors, backward
|
||||||
|
|
||||||
|
|
||||||
def fine_tune(embedding, combine=None):
|
def fine_tune(embedding, combine=None):
|
||||||
if combine is not None:
|
if combine is not None:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
|
@ -373,22 +374,30 @@ def fine_tune(embedding, combine=None):
|
||||||
flat_tokvecs = embedding.ops.flatten(tokvecs)
|
flat_tokvecs = embedding.ops.flatten(tokvecs)
|
||||||
flat_vecs = embedding.ops.flatten(vecs)
|
flat_vecs = embedding.ops.flatten(vecs)
|
||||||
output = embedding.ops.unflatten(
|
output = embedding.ops.unflatten(
|
||||||
(model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs),
|
(model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
|
||||||
lengths)
|
|
||||||
|
|
||||||
def fine_tune_bwd(d_output, sgd=None):
|
def fine_tune_bwd(d_output, sgd=None):
|
||||||
bp_vecs(d_output, sgd=sgd)
|
|
||||||
flat_grad = model.ops.flatten(d_output)
|
flat_grad = model.ops.flatten(d_output)
|
||||||
model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum()
|
model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
|
||||||
model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum()
|
model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
|
||||||
|
|
||||||
|
bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
sgd(model._mem.weights, model._mem.gradient, key=model.id)
|
sgd(model._mem.weights, model._mem.gradient, key=model.id)
|
||||||
return d_output
|
return [d_o * model.mix[0] for d_o in d_output]
|
||||||
return output, fine_tune_bwd
|
return output, fine_tune_bwd
|
||||||
|
|
||||||
|
def fine_tune_predict(docs_tokvecs):
|
||||||
|
docs, tokvecs = docs_tokvecs
|
||||||
|
vecs = embedding(docs)
|
||||||
|
return [model.mix[0]*tv+model.mix[1]*v
|
||||||
|
for tv, v in zip(tokvecs, vecs)]
|
||||||
|
|
||||||
model = wrap(fine_tune_fwd, embedding)
|
model = wrap(fine_tune_fwd, embedding)
|
||||||
model.mix = model._mem.add((model.id, 'mix'), (2,))
|
model.mix = model._mem.add((model.id, 'mix'), (2,))
|
||||||
model.mix.fill(1.)
|
model.mix.fill(0.5)
|
||||||
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
|
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
|
||||||
|
model.predict = fine_tune_predict
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,5 +2,7 @@ from .download import download
|
||||||
from .info import info
|
from .info import info
|
||||||
from .link import link
|
from .link import link
|
||||||
from .package import package
|
from .package import package
|
||||||
|
from .profile import profile
|
||||||
from .train import train
|
from .train import train
|
||||||
from .convert import convert
|
from .convert import convert
|
||||||
|
from .model import model
|
||||||
|
|
|
@ -24,13 +24,14 @@ def download(cmd, model, direct=False):
|
||||||
with version.
|
with version.
|
||||||
"""
|
"""
|
||||||
if direct:
|
if direct:
|
||||||
download_model('{m}/{m}.tar.gz'.format(m=model))
|
dl = download_model('{m}/{m}.tar.gz'.format(m=model))
|
||||||
else:
|
else:
|
||||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
||||||
model_name = shortcuts.get(model, model)
|
model_name = shortcuts.get(model, model)
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||||
|
if dl == 0:
|
||||||
try:
|
try:
|
||||||
# Get package path here because link uses
|
# Get package path here because link uses
|
||||||
# pip.get_installed_distributions() to check if model is a package,
|
# pip.get_installed_distributions() to check if model is a package,
|
||||||
|
@ -77,6 +78,6 @@ def get_version(model, comp):
|
||||||
|
|
||||||
def download_model(filename):
|
def download_model(filename):
|
||||||
download_url = about.__download_url__ + '/' + filename
|
download_url = about.__download_url__ + '/' + filename
|
||||||
subprocess.call([sys.executable, '-m',
|
return subprocess.call([sys.executable, '-m',
|
||||||
'pip', 'install', '--no-cache-dir', download_url],
|
'pip', 'install', '--no-cache-dir', download_url],
|
||||||
env=os.environ.copy())
|
env=os.environ.copy())
|
||||||
|
|
119
spacy/cli/model.py
Normal file
119
spacy/cli/model.py
Normal file
|
@ -0,0 +1,119 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import math
|
||||||
|
from ast import literal_eval
|
||||||
|
from pathlib import Path
|
||||||
|
from preshed.counter import PreshCounter
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
from ..compat import fix_text
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
    """Create a blank pipeline for `lang`, fill its vocab and save it.

    cmd: Not used in the body.
    lang: Language identifier handed to `spacy.blank`.
    model_dir: Output directory for the serialized pipeline.
    freqs_data: Path to the frequencies file (must exist).
    clusters_data: Optional path to a clusters file; falsy to skip.
    vectors_data: Optional path to a vectors file; falsy to skip. It is only
        checked for existence here and is not loaded.
    """
    model_path = Path(model_dir)
    freqs_path = Path(freqs_data)
    clusters_path = None
    if clusters_data:
        clusters_path = Path(clusters_data)
    vectors_path = None
    if vectors_data:
        vectors_path = Path(vectors_data)

    # Bail out early if any of the given input files is missing.
    check_dirs(freqs_path, clusters_path, vectors_path)

    nlp = spacy.blank(lang)
    probs, oov_prob = read_probs(freqs_path)
    if clusters_path:
        clusters = read_clusters(clusters_path)
    else:
        clusters = {}
    populate_vocab(nlp.vocab, clusters, probs, oov_prob)
    create_model(model_path, nlp)
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(model_path, model):
    """Write `model` to the directory `model_path`, creating it if missing.

    model_path (Path): Target directory.
    model: Object exposing `to_disk(path)`; it receives the POSIX-style
        string form of `model_path`.
    """
    if not model_path.exists():
        # parents=True so that a nested output path whose parent directories
        # do not exist yet is created instead of raising.
        model_path.mkdir(parents=True)
    model.to_disk(model_path.as_posix())
|
||||||
|
|
||||||
|
|
||||||
|
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    """Read a frequencies file and turn the counts into log probabilities.

    Each line must hold three tab-separated fields: frequency, document
    frequency and a key that is a Python literal (parsed with
    `literal_eval`). The file is read twice: once to accumulate the counts
    and their total, and once to emit the smoothed log probability of every
    entry that passes the thresholds.

    freqs_path (Path): Location of the frequencies file (optionally gzipped).
    max_length (int): Entries whose raw key is this long or longer are skipped.
    min_doc_freq (int): Minimum document frequency for an entry to be kept.
    min_freq (int): Minimum frequency for an entry to be kept.
    RETURNS (tuple): `(probs, oov_prob)` where `probs` maps word to log
        probability and `oov_prob` is the log probability for a zero count.
    RAISES (ValueError): If a line is malformed or the total frequency is
        not positive.
    """
    def parse(line):
        # A gzip handle opened in binary mode yields bytes, which cannot be
        # split on a text separator on Python 3 -- decode those first.
        if isinstance(line, bytes):
            line = line.decode('utf8')
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        return int(freq), int(doc_freq), key

    counts = PreshCounter()
    total = 0
    # First pass: register every count. `with` makes sure the handle is
    # closed again before the file is reopened for the second pass.
    with check_unzip(freqs_path) as freqs_file:
        for i, line in enumerate(freqs_file):
            freq = parse(line)[0]
            counts.inc(i + 1, freq)
            total += freq
    if total <= 0:
        # math.log() would fail below with an opaque "math domain error".
        raise ValueError(
            "No positive frequencies found in %s" % freqs_path.as_posix())
    counts.smooth()
    log_total = math.log(total)

    probs = {}
    # Second pass: keep only entries above the thresholds.
    with check_unzip(freqs_path) as freqs_file:
        for line in freqs_file:
            freq, doc_freq, key = parse(line)
            if (doc_freq >= min_doc_freq and freq >= min_freq
                    and len(key) < max_length):
                word = literal_eval(key)
                probs[word] = math.log(counts.smoother(freq)) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
|
||||||
|
|
||||||
|
|
||||||
|
def read_clusters(clusters_path):
    """Load a word -> cluster mapping from a whitespace-separated file.

    Every usable line has exactly three fields: cluster, word, frequency.
    Lines that do not unpack into three fields (or whose word makes
    `fix_text` raise ValueError) are skipped.

    clusters_path (Path): Location of the clusters file.
    RETURNS (dict): Mapping of word to cluster string, including lower-,
        title- and upper-cased variants of each word that were not already
        present.
    """
    clusters = {}
    with clusters_path.open() as handle:
        for row in handle:
            try:
                cluster, word, freq = row.split()
                word = fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable, so fall back to '0'.
            clusters[word] = cluster if int(freq) >= 3 else '0'
    # Expand clusters with re-casing. Iterate over a snapshot because the
    # dict grows while we go; existing entries are never overwritten.
    for word, cluster in list(clusters.items()):
        for variant in (word.lower(), word.title(), word.upper()):
            clusters.setdefault(variant, cluster)
    return clusters
|
||||||
|
|
||||||
|
|
||||||
|
def populate_vocab(vocab, clusters, probs, oov_prob):
    """Set `prob`, `is_oov` and `cluster` on the vocab entry of every word.

    Words are visited from highest to lowest probability (the reverse of a
    stable ascending sort).

    vocab: Indexable by word; each item must accept attribute assignment.
    clusters (dict): Maps word to a string of binary digits.
    probs (dict): Maps word to its log probability.
    oov_prob: Not used in the body.
    """
    ascending = sorted(probs.items(), key=lambda pair: pair[1])
    for word, prob in reversed(ascending):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = int(clusters[word][::-1], 2) if word in clusters else 0
|
||||||
|
|
||||||
|
|
||||||
|
def check_unzip(file_path):
    """Open `file_path` for reading UTF-8 text, decompressing `.gz` files.

    file_path (Path): File to open.
    RETURNS: A text-mode file object yielding unicode lines for both the
        gzipped and the plain case. The caller is responsible for closing it
        (it can be used as a context manager).
    """
    # Local import so the block does not depend on the file's import section.
    import io

    file_path_str = file_path.as_posix()
    # Match the real extension: a bare 'gz' suffix would also catch file
    # names that merely happen to end in those two letters.
    if file_path_str.endswith('.gz'):
        # gzip handles produce bytes; wrap them so callers get decoded text
        # just like in the plain-file branch. The BufferedReader layer keeps
        # this working on Python 2.7, where GzipFile has no read1().
        raw = io.BufferedReader(gzip.open(file_path_str, 'rb'))
        return io.TextIOWrapper(raw, encoding='utf8')
    return file_path.open('r', encoding='utf8')
|
||||||
|
|
||||||
|
|
||||||
|
def check_dirs(freqs_data, clusters_data, vectors_data):
    """Report via `util.sys_exit` any input file that does not exist.

    freqs_data (Path): Frequencies file; always checked.
    clusters_data (Path): Clusters file; checked only when truthy.
    vectors_data (Path): Vectors file; checked only when truthy.
    """
    if not freqs_data.is_file():
        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
    # The remaining inputs are optional, so only a given-but-missing file
    # is reported.
    optional_inputs = (
        (clusters_data, "No Brown clusters file found"),
        (vectors_data, "No word vectors file found"),
    )
    for path, title in optional_inputs:
        if path and not path.is_file():
            util.sys_exit(path.as_posix(), title=title)
|
45
spacy/cli/profile.py
Normal file
45
spacy/cli/profile.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
|
import plac
|
||||||
|
from pathlib import Path
|
||||||
|
import ujson
|
||||||
|
import cProfile
|
||||||
|
import pstats
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
import sys
|
||||||
|
import tqdm
|
||||||
|
import cytoolz
|
||||||
|
|
||||||
|
|
||||||
|
def read_inputs(loc):
    """Yield the 'text' field of every JSON line read from `loc`.

    loc: Path of a file with one JSON object per line, or None to read the
        lines from standard input instead.
    YIELDS: The value stored under the 'text' key of each line.
    """
    if loc is None:
        for line in sys.stdin:
            # Lines from stdin are encoded to UTF-8 before being parsed.
            yield ujson.loads(line.encode('utf8'))['text']
    else:
        # `with` closes the file once the generator is exhausted or
        # discarded, instead of leaving the handle open.
        with Path(loc).open() as file_:
            for line in file_:
                yield ujson.loads(line)['text']
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.

    cmd: Not used in the body.
    lang: Name handed to `spacy.load`.
    inputs: Iterable of texts (the annotation converts a location with
        `read_inputs`). When omitted, texts are read from standard input.

    At most 10000 texts are consumed. The profile is written to
    "Profile.prof" in the current working directory and the statistics are
    printed sorted by time.
    """
    if inputs is None:
        # The default is not run through the annotation's converter, so
        # without this fallback take() below would be handed None and fail.
        inputs = read_inputs(None)
    nlp = spacy.load(lang)
    texts = list(cytoolz.take(10000, inputs))
    # runctx evaluates the statement string against these namespaces, which
    # is why `nlp` and `texts` must be locals with exactly these names.
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_texts(nlp, texts):
    """Run every text through `nlp.pipe` and discard the results.

    nlp: Object exposing `pipe(texts, batch_size=...)`.
    texts: Iterable of texts; wrapped in a tqdm progress bar.
    """
    progress = tqdm.tqdm(texts)
    docs = nlp.pipe(progress, batch_size=128)
    # Only the processing cost matters here, so just drain the iterator.
    for _ in docs:
        pass
|
|
@ -32,10 +32,12 @@ from ..compat import json_dumps
|
||||||
resume=("Whether to resume training", "flag", "R", bool),
|
resume=("Whether to resume training", "flag", "R", bool),
|
||||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||||
no_parser=("Don't train parser", "flag", "P", bool),
|
no_parser=("Don't train parser", "flag", "P", bool),
|
||||||
no_entities=("Don't train NER", "flag", "N", bool)
|
no_entities=("Don't train NER", "flag", "N", bool),
|
||||||
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
)
|
)
|
||||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||||
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
|
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
|
||||||
|
gold_preproc=False):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train a model. Expects data in spaCy's JSON format.
|
||||||
"""
|
"""
|
||||||
|
@ -86,13 +88,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||||
i += 20
|
i += 20
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||||
train_docs = corpus.train_docs(nlp, projectivize=True,
|
train_docs = corpus.train_docs(nlp, projectivize=True,
|
||||||
gold_preproc=False, max_length=0)
|
gold_preproc=gold_preproc, max_length=0)
|
||||||
losses = {}
|
losses = {}
|
||||||
for batch in minibatch(train_docs, size=batch_sizes):
|
for batch in minibatch(train_docs, size=batch_sizes):
|
||||||
docs, golds = zip(*batch)
|
docs, golds = zip(*batch)
|
||||||
nlp.update(docs, golds, sgd=optimizer,
|
nlp.update(docs, golds, sgd=optimizer,
|
||||||
drop=next(dropout_rates), losses=losses,
|
drop=next(dropout_rates), losses=losses,
|
||||||
update_tensors=True)
|
update_shared=True)
|
||||||
pbar.update(sum(len(doc) for doc in docs))
|
pbar.update(sum(len(doc) for doc in docs))
|
||||||
|
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
|
@ -104,7 +106,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||||
scorer = nlp_loaded.evaluate(
|
scorer = nlp_loaded.evaluate(
|
||||||
corpus.dev_docs(
|
corpus.dev_docs(
|
||||||
nlp_loaded,
|
nlp_loaded,
|
||||||
gold_preproc=False))
|
gold_preproc=gold_preproc))
|
||||||
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
|
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
|
||||||
with acc_loc.open('w') as file_:
|
with acc_loc.open('w') as file_:
|
||||||
file_.write(json_dumps(scorer.scores))
|
file_.write(json_dumps(scorer.scores))
|
||||||
|
|
|
@ -60,7 +60,7 @@ GLOSSARY = {
|
||||||
'JJR': 'adjective, comparative',
|
'JJR': 'adjective, comparative',
|
||||||
'JJS': 'adjective, superlative',
|
'JJS': 'adjective, superlative',
|
||||||
'LS': 'list item marker',
|
'LS': 'list item marker',
|
||||||
'MD': 'verb, modal auxillary',
|
'MD': 'verb, modal auxiliary',
|
||||||
'NIL': 'missing tag',
|
'NIL': 'missing tag',
|
||||||
'NN': 'noun, singular or mass',
|
'NN': 'noun, singular or mass',
|
||||||
'NNP': 'noun, proper singular',
|
'NNP': 'noun, proper singular',
|
||||||
|
@ -91,7 +91,7 @@ GLOSSARY = {
|
||||||
'NFP': 'superfluous punctuation',
|
'NFP': 'superfluous punctuation',
|
||||||
'GW': 'additional word in multi-word expression',
|
'GW': 'additional word in multi-word expression',
|
||||||
'XX': 'unknown',
|
'XX': 'unknown',
|
||||||
'BES': 'auxillary "be"',
|
'BES': 'auxiliary "be"',
|
||||||
'HVS': 'forms of "have"',
|
'HVS': 'forms of "have"',
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -406,11 +406,11 @@ cdef class GoldParse:
|
||||||
if tags is None:
|
if tags is None:
|
||||||
tags = [None for _ in doc]
|
tags = [None for _ in doc]
|
||||||
if heads is None:
|
if heads is None:
|
||||||
heads = [token.i for token in doc]
|
heads = [None for token in doc]
|
||||||
if deps is None:
|
if deps is None:
|
||||||
deps = [None for _ in doc]
|
deps = [None for _ in doc]
|
||||||
if entities is None:
|
if entities is None:
|
||||||
entities = ['-' for _ in doc]
|
entities = [None for _ in doc]
|
||||||
elif len(entities) == 0:
|
elif len(entities) == 0:
|
||||||
entities = ['O' for _ in doc]
|
entities = ['O' for _ in doc]
|
||||||
elif not isinstance(entities[0], basestring):
|
elif not isinstance(entities[0], basestring):
|
||||||
|
|
|
@ -232,7 +232,10 @@ for verb_data in [
|
||||||
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
|
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
|
||||||
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
|
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
|
||||||
{ORTH: "was", LEMMA: "be", NORM: "was"},
|
{ORTH: "was", LEMMA: "be", NORM: "was"},
|
||||||
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
|
{ORTH: "were", LEMMA: "be", NORM: "were"},
|
||||||
|
{ORTH: "have", NORM: "have"},
|
||||||
|
{ORTH: "has", LEMMA: "have", NORM: "has"},
|
||||||
|
{ORTH: "dare", NORM: "dare"}]:
|
||||||
verb_data_tc = dict(verb_data)
|
verb_data_tc = dict(verb_data)
|
||||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||||
for data in [verb_data, verb_data_tc]:
|
for data in [verb_data, verb_data_tc]:
|
||||||
|
|
|
@ -200,6 +200,7 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
flat_list.append(pipe)
|
flat_list.append(pipe)
|
||||||
self.pipeline = flat_list
|
self.pipeline = flat_list
|
||||||
|
self._optimizer = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def meta(self):
|
def meta(self):
|
||||||
|
@ -278,7 +279,7 @@ class Language(object):
|
||||||
return self.tokenizer(text)
|
return self.tokenizer(text)
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
||||||
update_tensors=False):
|
update_shared=False):
|
||||||
"""Update the models in the pipeline.
|
"""Update the models in the pipeline.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -298,6 +299,10 @@ class Language(object):
|
||||||
"Got: %d, %d" % (len(docs), len(golds)))
|
"Got: %d, %d" % (len(docs), len(golds)))
|
||||||
if len(docs) == 0:
|
if len(docs) == 0:
|
||||||
return
|
return
|
||||||
|
if sgd is None:
|
||||||
|
if self._optimizer is None:
|
||||||
|
self._optimizer = Adam(Model.ops, 0.001)
|
||||||
|
sgd = self._optimizer
|
||||||
tok2vec = self.pipeline[0]
|
tok2vec = self.pipeline[0]
|
||||||
feats = tok2vec.doc2feats(docs)
|
feats = tok2vec.doc2feats(docs)
|
||||||
grads = {}
|
grads = {}
|
||||||
|
@ -312,9 +317,10 @@ class Language(object):
|
||||||
continue
|
continue
|
||||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||||
drop=drop, sgd=get_grads, losses=losses)
|
drop=drop, sgd=get_grads, losses=losses)
|
||||||
if update_tensors and d_tokvecses is not None:
|
if update_shared and d_tokvecses is not None:
|
||||||
for i, d_tv in enumerate(d_tokvecses):
|
for i, d_tv in enumerate(d_tokvecses):
|
||||||
all_d_tokvecses[i] += d_tv
|
all_d_tokvecses[i] += d_tv
|
||||||
|
if update_shared and bp_tokvecses is not None:
|
||||||
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
||||||
for key, (W, dW) in grads.items():
|
for key, (W, dW) in grads.items():
|
||||||
sgd(W, dW, key=key)
|
sgd(W, dW, key=key)
|
||||||
|
@ -378,11 +384,11 @@ class Language(object):
|
||||||
eps = util.env_opt('optimizer_eps', 1e-08)
|
eps = util.env_opt('optimizer_eps', 1e-08)
|
||||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||||
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||||
beta2=beta2, eps=eps)
|
beta2=beta2, eps=eps)
|
||||||
optimizer.max_grad_norm = max_grad_norm
|
self._optimizer.max_grad_norm = max_grad_norm
|
||||||
optimizer.device = device
|
self._optimizer.device = device
|
||||||
return optimizer
|
return self._optimizer
|
||||||
|
|
||||||
def evaluate(self, docs_golds):
|
def evaluate(self, docs_golds):
|
||||||
scorer = Scorer()
|
scorer = Scorer()
|
||||||
|
|
|
@ -294,6 +294,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvecs = docs_tokvecs
|
||||||
|
|
||||||
if self.model.nI is None:
|
if self.model.nI is None:
|
||||||
|
@ -302,6 +304,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
|
|
||||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += loss
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
|
|
|
@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
def has_gold(self, GoldParse gold, start=0, end=None):
|
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||||
end = end or len(gold.ner)
|
end = end or len(gold.ner)
|
||||||
if all([tag == '-' for tag in gold.ner[start:end]]):
|
if all([tag in ('-', None) for tag in gold.ner[start:end]]):
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -14,4 +14,8 @@ cdef class Parser:
|
||||||
cdef readonly TransitionSystem moves
|
cdef readonly TransitionSystem moves
|
||||||
cdef readonly object cfg
|
cdef readonly object cfg
|
||||||
|
|
||||||
|
cdef void _parse_step(self, StateC* state,
|
||||||
|
const float* feat_weights,
|
||||||
|
int nr_class, int nr_feat, int nr_piece) nogil
|
||||||
|
|
||||||
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
|
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
|
||||||
|
|
|
@ -257,10 +257,15 @@ cdef class Parser:
|
||||||
nI=token_vector_width)
|
nI=token_vector_width)
|
||||||
|
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device('cpu'):
|
||||||
|
if depth == 0:
|
||||||
|
upper = chain()
|
||||||
|
upper.is_noop = True
|
||||||
|
else:
|
||||||
upper = chain(
|
upper = chain(
|
||||||
clone(Maxout(hidden_width), (depth-1)),
|
clone(Maxout(hidden_width), (depth-1)),
|
||||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
zero_init(Affine(nr_class, drop_factor=0.0))
|
||||||
)
|
)
|
||||||
|
upper.is_noop = False
|
||||||
# TODO: This is an unfortunate hack atm!
|
# TODO: This is an unfortunate hack atm!
|
||||||
# Used to set input dimensions in network.
|
# Used to set input dimensions in network.
|
||||||
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
||||||
|
@ -412,7 +417,14 @@ cdef class Parser:
|
||||||
cdef np.ndarray scores
|
cdef np.ndarray scores
|
||||||
c_token_ids = <int*>token_ids.data
|
c_token_ids = <int*>token_ids.data
|
||||||
c_is_valid = <int*>is_valid.data
|
c_is_valid = <int*>is_valid.data
|
||||||
|
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
||||||
while not next_step.empty():
|
while not next_step.empty():
|
||||||
|
if not has_hidden:
|
||||||
|
for i in cython.parallel.prange(
|
||||||
|
next_step.size(), num_threads=6, nogil=True):
|
||||||
|
self._parse_step(next_step[i],
|
||||||
|
feat_weights, nr_class, nr_feat, nr_piece)
|
||||||
|
else:
|
||||||
for i in range(next_step.size()):
|
for i in range(next_step.size()):
|
||||||
st = next_step[i]
|
st = next_step[i]
|
||||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||||
|
@ -482,7 +494,31 @@ cdef class Parser:
|
||||||
beams.append(beam)
|
beams.append(beam)
|
||||||
return beams
|
return beams
|
||||||
|
|
||||||
|
cdef void _parse_step(self, StateC* state,
|
||||||
|
const float* feat_weights,
|
||||||
|
int nr_class, int nr_feat, int nr_piece) nogil:
|
||||||
|
'''This only works with no hidden layers -- fast but inaccurate'''
|
||||||
|
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
|
||||||
|
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
|
||||||
|
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
||||||
|
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
|
||||||
|
is_valid = <int*>calloc(nr_class, sizeof(int))
|
||||||
|
|
||||||
|
state.set_context_tokens(token_ids, nr_feat)
|
||||||
|
sum_state_features(scores,
|
||||||
|
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
|
||||||
|
self.moves.set_valid(is_valid, state)
|
||||||
|
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
|
||||||
|
action = self.moves.c[guess]
|
||||||
|
action.do(state, action.label)
|
||||||
|
|
||||||
|
free(is_valid)
|
||||||
|
free(scores)
|
||||||
|
free(token_ids)
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
|
return None
|
||||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||||
return self.update_beam(docs_tokvecs, golds,
|
return self.update_beam(docs_tokvecs, golds,
|
||||||
self.cfg['beam_width'], self.cfg['beam_density'],
|
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||||
|
@ -555,6 +591,10 @@ cdef class Parser:
|
||||||
|
|
||||||
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
||||||
drop=0., sgd=None, losses=None):
|
drop=0., sgd=None, losses=None):
|
||||||
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
|
return None
|
||||||
|
if not golds:
|
||||||
|
return None
|
||||||
if width is None:
|
if width is None:
|
||||||
width = self.cfg.get('beam_width', 2)
|
width = self.cfg.get('beam_width', 2)
|
||||||
if density is None:
|
if density is None:
|
||||||
|
|
|
@ -303,8 +303,14 @@ cdef class Doc:
|
||||||
return self.user_hooks['vector'](self)
|
return self.user_hooks['vector'](self)
|
||||||
if self._vector is not None:
|
if self._vector is not None:
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.has_vector and len(self):
|
elif not len(self):
|
||||||
self._vector = sum(t.vector for t in self) / len(self)
|
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||||
|
return self._vector
|
||||||
|
elif self.has_vector:
|
||||||
|
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||||
|
for token in self.c[:self.length]:
|
||||||
|
vector += self.vocab.get_vector(token.lex.orth)
|
||||||
|
self._vector = vector / len(self)
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.tensor is not None:
|
elif self.tensor is not None:
|
||||||
self._vector = self.tensor.mean(axis=0)
|
self._vector = self.tensor.mean(axis=0)
|
||||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
import bz2
|
import bz2
|
||||||
import ujson
|
import ujson
|
||||||
import re
|
import re
|
||||||
|
import numpy
|
||||||
|
|
||||||
from libc.string cimport memset, memcpy
|
from libc.string cimport memset, memcpy
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
@ -244,7 +245,7 @@ cdef class Vocab:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vectors_length(self):
|
def vectors_length(self):
|
||||||
return len(self.vectors)
|
return self.vectors.data.shape[1]
|
||||||
|
|
||||||
def clear_vectors(self, new_dim=None):
|
def clear_vectors(self, new_dim=None):
|
||||||
"""Drop the current vector table. Because all vectors must be the same
|
"""Drop the current vector table. Because all vectors must be the same
|
||||||
|
@ -268,7 +269,10 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, basestring_):
|
if isinstance(orth, basestring_):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
|
if orth in self.vectors.key2row:
|
||||||
return self.vectors[orth]
|
return self.vectors[orth]
|
||||||
|
else:
|
||||||
|
return numpy.zeros((self.vectors_length,), dtype='f')
|
||||||
|
|
||||||
def set_vector(self, orth, vector):
|
def set_vector(self, orth, vector):
|
||||||
"""Set a vector for a word in the vocabulary.
|
"""Set a vector for a word in the vocabulary.
|
||||||
|
|
|
@ -21,7 +21,7 @@ p
|
||||||
+pos-row("$", "SYM", "SymType=currency", "symbol, currency")
|
+pos-row("$", "SYM", "SymType=currency", "symbol, currency")
|
||||||
+pos-row("ADD", "X", "", "email")
|
+pos-row("ADD", "X", "", "email")
|
||||||
+pos-row("AFX", "ADJ", "Hyph=yes", "affix")
|
+pos-row("AFX", "ADJ", "Hyph=yes", "affix")
|
||||||
+pos-row("BES", "VERB", "", 'auxillary "be"')
|
+pos-row("BES", "VERB", "", 'auxiliary "be"')
|
||||||
+pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
|
+pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
|
||||||
+pos-row("CD", "NUM", "NumType=card", "cardinal number")
|
+pos-row("CD", "NUM", "NumType=card", "cardinal number")
|
||||||
+pos-row("DT", "DET", "determiner")
|
+pos-row("DT", "DET", "determiner")
|
||||||
|
@ -35,7 +35,7 @@ p
|
||||||
+pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
|
+pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
|
||||||
+pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
|
+pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
|
||||||
+pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
|
+pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
|
||||||
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary")
|
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary")
|
||||||
+pos-row("NFP", "PUNCT", "", "superfluous punctuation")
|
+pos-row("NFP", "PUNCT", "", "superfluous punctuation")
|
||||||
+pos-row("NIL", "", "", "missing tag")
|
+pos-row("NIL", "", "", "missing tag")
|
||||||
+pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")
|
+pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")
|
||||||
|
|
|
@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
|
||||||
|
|
||||||
p
|
p
|
||||||
| The token within the span that's highest in the parse tree. If there's a
|
| The token within the span that's highest in the parse tree. If there's a
|
||||||
| tie, the earlist is prefered.
|
| tie, the earliest is preferred.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
doc = nlp(u'I like New York in Autumn.')
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
|
|
@ -39,7 +39,7 @@ p
|
||||||
+h(2, "special-cases") Adding special case tokenization rules
|
+h(2, "special-cases") Adding special case tokenization rules
|
||||||
|
|
||||||
p
|
p
|
||||||
| Most domains have at least some idiosyncracies that require custom
|
| Most domains have at least some idiosyncrasies that require custom
|
||||||
| tokenization rules. This could be very certain expressions, or
|
| tokenization rules. This could be very certain expressions, or
|
||||||
| abbreviations only used in this specific field.
|
| abbreviations only used in this specific field.
|
||||||
|
|
||||||
|
|
|
@ -109,7 +109,7 @@ p
|
||||||
| The other way to install spaCy is to clone its
|
| The other way to install spaCy is to clone its
|
||||||
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
|
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
|
||||||
| the common way if you want to make changes to the code base. You'll need to
|
| the common way if you want to make changes to the code base. You'll need to
|
||||||
| make sure that you have a development enviroment consisting of a Python
|
| make sure that you have a development environment consisting of a Python
|
||||||
| distribution including header files, a compiler,
|
| distribution including header files, a compiler,
|
||||||
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
|
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
|
||||||
| #[+a("https://virtualenv.pypa.io/") virtualenv] and
|
| #[+a("https://virtualenv.pypa.io/") virtualenv] and
|
||||||
|
|
|
@ -190,10 +190,10 @@ p
|
||||||
|
|
||||||
+code("Examples", "bash").
|
+code("Examples", "bash").
|
||||||
# set up shortcut link to load installed package as "en_default"
|
# set up shortcut link to load installed package as "en_default"
|
||||||
python -m spacy link en_core_web_md en_default
|
spacy link en_core_web_md en_default
|
||||||
|
|
||||||
# set up shortcut link to load local model as "my_amazing_model"
|
# set up shortcut link to load local model as "my_amazing_model"
|
||||||
python -m spacy link /Users/you/model my_amazing_model
|
spacy link /Users/you/model my_amazing_model
|
||||||
|
|
||||||
+infobox("Important note")
|
+infobox("Important note")
|
||||||
| In order to create a symlink, your user needs the #[strong required permissions].
|
| In order to create a symlink, your user needs the #[strong required permissions].
|
||||||
|
|
|
@ -40,7 +40,7 @@ p
|
||||||
+cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
|
+cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell I read the paper yesteday
|
+cell I read the paper yesterday
|
||||||
+cell read
|
+cell read
|
||||||
+cell read
|
+cell read
|
||||||
+cell verb
|
+cell verb
|
||||||
|
|
|
@ -94,7 +94,7 @@ p
|
||||||
| is mostly intended as a convenient, interactive wrapper. It performs
|
| is mostly intended as a convenient, interactive wrapper. It performs
|
||||||
| compatibility checks and prints detailed error messages and warnings.
|
| compatibility checks and prints detailed error messages and warnings.
|
||||||
| However, if you're downloading models as part of an automated build
|
| However, if you're downloading models as part of an automated build
|
||||||
| process, this only adds an unecessary layer of complexity. If you know
|
| process, this only adds an unnecessary layer of complexity. If you know
|
||||||
| which models your application needs, you should be specifying them directly.
|
| which models your application needs, you should be specifying them directly.
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
Loading…
Reference in New Issue
Block a user