Merge branch 'develop' of https://github.com/explosion/spaCy into develop

commit 80a5146ec2
Author: Matthew Honnibal
Date: 2017-08-20 11:07:08 -05:00

14 changed files with 178 additions and 25 deletions

View File

@@ -229,7 +229,7 @@ Compile from source
 The other way to install spaCy is to clone its
 `GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
 source. That is the common way if you want to make changes to the code base.
-You'll need to make sure that you have a development enviroment consisting of a
+You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
 `pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
 and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

View File

@@ -3,15 +3,21 @@ from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals

 if __name__ == '__main__':
     import plac
     import sys
-    from spacy.cli import download, link, info, package, train, convert
+    from spacy.cli import download, link, info, package, train, convert, model
     from spacy.util import prints

-    commands = {'download': download, 'link': link, 'info': info, 'train': train,
-                'convert': convert, 'package': package}
+    commands = {
+        'download': download,
+        'link': link,
+        'info': info,
+        'train': train,
+        'convert': convert,
+        'package': package,
+        'model': model
+    }
     if len(sys.argv) == 1:
         prints(', '.join(commands), title="Available commands", exits=1)
     command = sys.argv.pop(1)
@@ -19,5 +25,7 @@ if __name__ == '__main__':
     if command in commands:
         plac.call(commands[command])
     else:
-        prints("Available: %s" % ', '.join(commands),
-               title="Unknown command: %s" % command, exits=1)
+        prints(
+            "Available: %s" % ', '.join(commands),
+            title="Unknown command: %s" % command,
+            exits=1)
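
With `model` registered in the commands dict, the new subcommand dispatches through plac like the others. A minimal sketch of driving it from Python rather than the shell; the paths are hypothetical placeholders, and the argument order is assumed from the `model()` signature in the new file below:

    # Sketch only: placeholder paths, not a definitive invocation.
    # model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data)
    from spacy.cli import model

    model('model', 'en', '/tmp/en_model', 'freqs.txt.gz', 'clusters.txt', None)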

View File

@@ -4,3 +4,4 @@ from .link import link
 from .package import package
 from .train import train
 from .convert import convert
+from .model import model

spacy/cli/model.py (new file, 119 lines)
View File

@@ -0,0 +1,119 @@
# coding: utf8
from __future__ import unicode_literals

import gzip
import math
from ast import literal_eval
from pathlib import Path

from preshed.counter import PreshCounter

import spacy
from ..compat import fix_text
from .. import util


def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
    model_path = Path(model_dir)
    freqs_path = Path(freqs_data)
    clusters_path = Path(clusters_data) if clusters_data else None
    vectors_path = Path(vectors_data) if vectors_data else None
    check_dirs(freqs_path, clusters_path, vectors_path)

    # vocab = util.get_lang_class(lang).Defaults.create_vocab()
    nlp = spacy.blank(lang)
    vocab = nlp.vocab
    probs, oov_prob = read_probs(freqs_path)
    clusters = read_clusters(clusters_path) if clusters_path else {}
    populate_vocab(vocab, clusters, probs, oov_prob)
    create_model(model_path, nlp)


def create_model(model_path, model):
    if not model_path.exists():
        model_path.mkdir()
    model.to_disk(model_path.as_posix())


def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    counts = PreshCounter()
    total = 0
    freqs_file = check_unzip(freqs_path)
    for i, line in enumerate(freqs_file):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i + 1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    freqs_file = check_unzip(freqs_path)
    probs = {}
    for line in freqs_file:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(
                key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob


def read_clusters(clusters_path):
    clusters = {}
    with clusters_path.open() as f:
        for line in f:
            try:
                cluster, word, freq = line.split()
                word = fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = '0'
    # Expand clusters with re-casing
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters


def populate_vocab(vocab, clusters, probs, oov_prob):
    for word, prob in reversed(
            sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            lexeme.cluster = 0


def check_unzip(file_path):
    file_path_str = file_path.as_posix()
    if file_path_str.endswith('gz'):
        return gzip.open(file_path_str)
    else:
        return file_path.open()


def check_dirs(freqs_data, clusters_data, vectors_data):
    if not freqs_data.is_file():
        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
    if clusters_data and not clusters_data.is_file():
        util.sys_exit(
            clusters_data.as_posix(), title="No Brown clusters file found")
    if vectors_data and not vectors_data.is_file():
        util.sys_exit(
            vectors_data.as_posix(), title="No word vectors file found")
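
A note on the trickiest line above: populate_vocab() stores Brown cluster bit strings little-endian, so that `cluster & 15` recovers the first four bits of the cluster path (see the comment referencing _parse_features.pyx). A worked example with an illustrative cluster string:

    # Illustrative only: decoding a Brown cluster the way populate_vocab does.
    cluster = '110110'               # bit string as read from the clusters file
    encoded = int(cluster[::-1], 2)  # reverse -> '011011' -> 27 (little-endian)
    print(encoded & 15)              # 11 == int('1101'[::-1], 2): the first 4 bits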

View File

@@ -406,11 +406,11 @@ cdef class GoldParse:
         if tags is None:
             tags = [None for _ in doc]
         if heads is None:
-            heads = [token.i for token in doc]
+            heads = [None for token in doc]
         if deps is None:
             deps = [None for _ in doc]
         if entities is None:
-            entities = ['-' for _ in doc]
+            entities = [None for _ in doc]
         elif len(entities) == 0:
             entities = ['O' for _ in doc]
         elif not isinstance(entities[0], basestring):
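
The net effect: positions without annotation now default to None (missing) rather than to a self-attached head or a '-' entity tag, so downstream components can tell "unannotated" apart from a real label. A small sketch, assuming only the constructor defaults shown above:

    # Sketch, assuming the defaults shown in the hunk above.
    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank('en')
    doc = nlp(u'Hello world')
    gold = GoldParse(doc, tags=[u'UH', u'NN'])  # no heads/deps/entities supplied
    # heads now defaults to [None, None] (missing) instead of [0, 1]
    # (self-attachment), and the NER tags to [None, None] instead of ['-', '-'].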

View File

@@ -200,6 +200,7 @@ class Language(object):
             else:
                 flat_list.append(pipe)
         self.pipeline = flat_list
+        self._optimizer = None

     @property
     def meta(self):
@@ -278,7 +279,7 @@ class Language(object):
         return self.tokenizer(text)

     def update(self, docs, golds, drop=0., sgd=None, losses=None,
-               update_tensors=False):
+               update_shared=False):
         """Update the models in the pipeline.

         docs (iterable): A batch of `Doc` objects.
@@ -298,6 +299,10 @@ class Language(object):
                 "Got: %d, %d" % (len(docs), len(golds)))
         if len(docs) == 0:
             return
+        if sgd is None:
+            if self._optimizer is None:
+                self._optimizer = Adam(Model.ops, 0.001)
+            sgd = self._optimizer
         tok2vec = self.pipeline[0]
         feats = tok2vec.doc2feats(docs)
         grads = {}
@@ -312,9 +317,10 @@ class Language(object):
                 continue
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            if update_tensors and d_tokvecses is not None:
+            if update_shared and d_tokvecses is not None:
                 for i, d_tv in enumerate(d_tokvecses):
                     all_d_tokvecses[i] += d_tv
-        bp_tokvecses(all_d_tokvecses, sgd=sgd)
+        if update_shared and bp_tokvecses is not None:
+            bp_tokvecses(all_d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
@@ -378,11 +384,11 @@ class Language(object):
         eps = util.env_opt('optimizer_eps', 1e-08)
         L2 = util.env_opt('L2_penalty', 1e-6)
         max_grad_norm = util.env_opt('grad_norm_clip', 1.)
-        optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
-                         beta2=beta2, eps=eps)
-        optimizer.max_grad_norm = max_grad_norm
-        optimizer.device = device
-        return optimizer
+        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
+                               beta2=beta2, eps=eps)
+        self._optimizer.max_grad_norm = max_grad_norm
+        self._optimizer.device = device
+        return self._optimizer

     def evaluate(self, docs_golds):
         scorer = Scorer()
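
Two behavioural changes fall out of this file: update() now lazily creates and caches a default Adam optimizer when sgd is None, and gradients only flow back into the shared tok2vec layer when update_shared=True. A sketch of a training call under those assumptions (nlp, docs and golds are presumed already prepared):

    # Sketch: docs/golds are an aligned batch prepared elsewhere.
    # With sgd omitted, update() reuses one cached Adam(Model.ops, 0.001)
    # across calls instead of requiring a fresh optimizer each time:
    nlp.update(docs, golds, drop=0.2)

    # Opt in to updating the shared tok2vec weights as well:
    nlp.update(docs, golds, drop=0.2, update_shared=True)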

View File

@@ -294,6 +294,8 @@ class NeuralTagger(BaseThincComponent):
         doc.is_tagged = True

     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
+        if losses is not None and self.name not in losses:
+            losses[self.name] = 0.
         docs, tokvecs = docs_tokvecs

         if self.model.nI is None:
@@ -302,6 +304,8 @@ class NeuralTagger(BaseThincComponent):
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
+        if losses is not None:
+            losses[self.name] += loss
         return d_tokvecs

     def get_loss(self, docs, golds, scores):
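
Since the tagger now seeds its own entry in the losses dict and accumulates into it, callers can track per-component loss with a plain dict. A sketch, assuming a pipeline and batches prepared as above:

    # Sketch: accumulate per-component losses over a batch loop.
    losses = {}
    for batch_docs, batch_golds in train_batches:  # assumed iterable of batches
        nlp.update(batch_docs, batch_golds, drop=0.2, losses=losses)
    print(losses)  # e.g. {'tagger': 12.3, ...}; no need to pre-seed the keys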

View File

@@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):

     def has_gold(self, GoldParse gold, start=0, end=None):
         end = end or len(gold.ner)
-        if all([tag == '-' for tag in gold.ner[start:end]]):
+        if all([tag in ('-', None) for tag in gold.ner[start:end]]):
             return False
         else:
             return True
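
Together with the GoldParse change above, this makes both '-' and None mean "no NER annotation here". The predicate is plain Python and easy to check in isolation:

    # The check from has_gold(), on illustrative tag sequences:
    print(all([tag in ('-', None) for tag in [None, '-', None]]))     # True  -> no gold
    print(all([tag in ('-', None) for tag in [None, u'U-GPE', '-']])) # False -> has gold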

View File

@@ -483,6 +483,9 @@ cdef class Parser:
         return beams

     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
+        docs_tokvecs, golds = self._filter_unlabelled(docs_tokvecs, golds)
+        if not golds:
+            return None
         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
             return self.update_beam(docs_tokvecs, golds,
                 self.cfg['beam_width'], self.cfg['beam_density'],
@@ -555,6 +558,9 @@ cdef class Parser:
     def update_beam(self, docs_tokvecs, golds, width=None, density=None,
                     drop=0., sgd=None, losses=None):
+        docs_tokvecs, golds = self._filter_unlabelled(docs_tokvecs, golds)
+        if not golds:
+            return None
         if width is None:
             width = self.cfg.get('beam_width', 2)
         if density is None:
@@ -605,6 +611,15 @@ cdef class Parser:
         bp_my_tokvecs(d_tokvecs, sgd=sgd)
         return d_tokvecs

+    def _filter_unlabelled(self, docs_tokvecs, golds):
+        '''Remove inputs that have no relevant labels before update'''
+        has_golds = [self.moves.has_gold(gold) for gold in golds]
+        docs, tokvecs = docs_tokvecs
+        docs = [docs[i] for i, has_gold in enumerate(has_golds) if has_gold]
+        tokvecs = [tokvecs[i] for i, has_gold in enumerate(has_golds) if has_gold]
+        golds = [golds[i] for i, has_gold in enumerate(has_golds) if has_gold]
+        return (docs, tokvecs), golds
+
     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
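
_filter_unlabelled() pairs with has_gold() above: examples whose gold annotations are all missing are dropped before the expensive update, and update() returns None outright when nothing survives. The same keep-by-mask pattern in isolation, with stand-in values:

    # Stand-in values; mirrors the index-based comprehensions above via zip().
    has_golds = [True, False, True]
    docs = ['doc0', 'doc1', 'doc2']
    print([doc for doc, keep in zip(docs, has_golds) if keep])  # ['doc0', 'doc2']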

View File

@@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.

 p
     | The token within the span that's highest in the parse tree. If there's a
-    | tie, the earlist is prefered.
+    | tie, the earliest is preferred.

 +aside-code("Example").
     doc = nlp(u'I like New York in Autumn.')

View File

@@ -39,7 +39,7 @@ p

 +h(2, "special-cases") Adding special case tokenization rules

 p
-    | Most domains have at least some idiosyncracies that require custom
+    | Most domains have at least some idiosyncrasies that require custom
     | tokenization rules. This could be very certain expressions, or
     | abbreviations only used in this specific field.
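
For context on this docs fix: the rules this section goes on to describe are registered with Tokenizer.add_special_case. A brief sketch; the 'gimme' rule is the usual illustration, and the attribute set is assumed from the docs of this era:

    # Sketch, assuming the add_special_case API described in these docs.
    import spacy
    from spacy.attrs import ORTH, LEMMA

    nlp = spacy.blank('en')
    nlp.tokenizer.add_special_case(u'gimme',
        [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])
    print([t.text for t in nlp(u'gimme that')])  # ['gim', 'me', 'that']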

View File

@@ -109,7 +109,7 @@ p
     | The other way to install spaCy is to clone its
     | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
     | the common way if you want to make changes to the code base. You'll need to
-    | make sure that you have a development enviroment consisting of a Python
+    | make sure that you have a development environment consisting of a Python
     | distribution including header files, a compiler,
     | #[+a("https://pip.pypa.io/en/latest/installing/") pip],
     | #[+a("https://virtualenv.pypa.io/") virtualenv] and

View File

@@ -40,7 +40,7 @@ p
         +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]

     +row
-        +cell I read the paper yesteday
+        +cell I read the paper yesterday
         +cell read
         +cell read
         +cell verb

View File

@@ -94,7 +94,7 @@ p
     | is mostly intended as a convenient, interactive wrapper. It performs
     | compatibility checks and prints detailed error messages and warnings.
     | However, if you're downloading models as part of an automated build
-    | process, this only adds an unecessary layer of complexity. If you know
+    | process, this only adds an unnecessary layer of complexity. If you know
     | which models your application needs, you should be specifying them directly.

 p