Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Author: Matthew Honnibal
Date: 2017-10-01 22:10:54 +02:00
Commit: e38089d598
7 changed files with 109 additions and 8 deletions

spacy/__main__.py

@@ -7,7 +7,7 @@ if __name__ == '__main__':
     import plac
     import sys
     from spacy.cli import download, link, info, package, train, convert, model
-    from spacy.cli import profile
+    from spacy.cli import profile, evaluate
     from spacy.util import prints

     commands = {
@@ -15,6 +15,7 @@ if __name__ == '__main__':
         'link': link,
         'info': info,
         'train': train,
+        'evaluate': evaluate,
         'convert': convert,
         'package': package,
         'model': model,
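With `'evaluate'` in the commands table, the dispatcher in this file can resolve the subcommand by name, so `python -m spacy evaluate <model> <data>` reaches the new function. A minimal sketch of the mapping (model name and data path are placeholders; `evaluate()` never uses its leading `cmd` argument, so `None` is safe in a direct call):

```python
from spacy.cli import evaluate

commands = {'evaluate': evaluate}        # excerpt of the table above
cmd = commands['evaluate']               # resolved by subcommand name
cmd(None, 'en_core_web_sm', 'dev.json')  # placeholders for model and data
```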

spacy/cli/__init__.py

@@ -4,5 +4,6 @@ from .link import link
 from .package import package
 from .profile import profile
 from .train import train
+from .evaluate import evaluate
 from .convert import convert
 from .model import model

spacy/cli/evaluate.py (new file, 93 lines)

@@ -0,0 +1,93 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function

import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random

from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps

random.seed(0)
numpy.random.seed(0)


@plac.annotations(
    model=("Model name or path", "positional", None, str),
    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
)
def evaluate(cmd, model, data_path, gold_preproc=False):
    """
    Evaluate a model. Expects data in spaCy's JSON format.
    """
    util.set_env_log(True)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        prints(data_path, title="Evaluation data not found", exits=1)
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
    scorer = nlp.evaluate(list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)))
    print_results(scorer)


def _render_parses(i, to_render):
    to_render[0].user_data['title'] = "Batch %d" % i
    with Path('/tmp/entities.html').open('w') as file_:
        html = displacy.render(to_render[:5], style='ent', page=True)
        file_.write(html)
    with Path('/tmp/parses.html').open('w') as file_:
        html = displacy.render(to_render[:5], style='dep', page=True)
        file_.write(html)


def print_progress(itn, losses, dev_scores, wps=0.0):
    scores = {}
    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
                'ents_p', 'ents_r', 'ents_f', 'wps']:
        scores[col] = 0.0
    scores['dep_loss'] = losses.get('parser', 0.0)
    scores['ner_loss'] = losses.get('ner', 0.0)
    scores['tag_loss'] = losses.get('tagger', 0.0)
    scores.update(dev_scores)
    scores['wps'] = wps
    tpl = '\t'.join((
        '{:d}',
        '{dep_loss:.3f}',
        '{ner_loss:.3f}',
        '{uas:.3f}',
        '{ents_p:.3f}',
        '{ents_r:.3f}',
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
        '{wps:.1f}'))
    print(tpl.format(itn, **scores))


def print_results(scorer):
    results = {
        'TOK': '%.2f' % scorer.token_acc,
        'POS': '%.2f' % scorer.tags_acc,
        'UAS': '%.2f' % scorer.uas,
        'LAS': '%.2f' % scorer.las,
        'NER P': '%.2f' % scorer.ents_p,
        'NER R': '%.2f' % scorer.ents_r,
        'NER F': '%.2f' % scorer.ents_f}
    util.print_table(results, title="Results")
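Stripped of the CLI plumbing, `evaluate()` reduces to a few library calls, as its body above shows. A hedged sketch of the same flow through the public API (model name and data path are placeholders):

```python
from spacy import util
from spacy.gold import GoldCorpus

nlp = util.load_model('en_core_web_sm')      # placeholder model name
corpus = GoldCorpus('dev.json', 'dev.json')  # same path fills both corpus slots
scorer = nlp.evaluate(list(corpus.dev_docs(nlp, gold_preproc=False)))
print('UAS=%.2f LAS=%.2f NER F=%.2f' % (scorer.uas, scorer.las, scorer.ents_f))
```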

spacy/cli/package.py

@@ -105,8 +105,11 @@ def generate_pipeline():
            "parser, ner. For more information, see the docs on processing pipelines.",
            title="Enter your model's pipeline components")
     pipeline = util.get_raw_input("Pipeline components", True)
-    replace = {'True': True, 'False': False}
-    return replace[pipeline] if pipeline in replace else pipeline.split(', ')
+    subs = {'True': True, 'False': False}
+    if pipeline in subs:
+        return subs[pipeline]
+    else:
+        return [p.strip() for p in pipeline.split(',')]


 def validate_meta(meta, keys):
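The rewrite also fixes a parsing quirk: the old one-liner split on the exact string `', '`, so input without a space after each comma produced wrong component names. A standalone restatement of the new logic (hypothetical helper name) for a quick check:

```python
def parse_pipeline(pipeline):
    # restatement of the logic above (hypothetical helper name)
    subs = {'True': True, 'False': False}
    if pipeline in subs:
        return subs[pipeline]
    return [p.strip() for p in pipeline.split(',')]

assert parse_pipeline('True') is True
# spacing around commas no longer matters, unlike the old split(', ')
assert parse_pipeline('tagger,parser, ner') == ['tagger', 'parser', 'ner']
```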

spacy/syntax/nn_parser.pyx

@@ -533,7 +533,7 @@ cdef class Parser:
         states, golds, max_steps = self._init_gold_batch(docs, golds)
         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
-                                                                            0.0)
+                                                                            drop)
         todo = [(s, g) for (s, g) in zip(states, golds)
                 if not s.is_final() and g is not None]
         if not todo:
@@ -598,7 +598,7 @@ cdef class Parser:
             self.moves.preprocess_gold(gold)
         cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, 0.0)
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
         states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
                                                              states, golds,
@@ -685,7 +685,7 @@ cdef class Parser:
         tok2vec, lower, upper = self.model
         tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout)
         state2vec = precompute_hiddens(len(docs), tokvecs,
-                                       lower, stream, drop=dropout)
+                                       lower, stream, drop=0.0)
         return (tokvecs, bp_tokvecs), state2vec, upper

     nr_feature = 8
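Taken together, these three hunks reroute dropout: the two update paths now pass their real `drop` rate into `get_batch_model`, which applies it when the token vectors are computed and pins the precomputed hidden layer to 0.0, so dropout is applied exactly once. A minimal numpy sketch of the resulting pattern (function names are hypothetical stand-ins, not spaCy's API):

```python
import numpy

def apply_dropout(X, rate):
    # standard inverted dropout: zero a fraction of units, rescale the rest
    if rate <= 0.0:
        return X
    mask = numpy.random.binomial(1, 1.0 - rate, X.shape) / (1.0 - rate)
    return X * mask

def get_batch_model(X, dropout):
    tokvecs = apply_dropout(X, dropout)    # applied at the tok2vec stage...
    hiddens = apply_dropout(tokvecs, 0.0)  # ...and pinned to 0.0 afterwards
    return hiddens

X = numpy.ones((4, 8), dtype='f')
print(get_batch_model(X, dropout=0.2).mean())  # callers now pass the real rate
```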

spacy/util.py

@@ -181,9 +181,10 @@ def is_package(name):
     name (unicode): Name of package.
     RETURNS (bool): True if installed package, False if not.
     """
+    name = name.lower()  # compare package name against lowercase name
     packages = pkg_resources.working_set.by_key.keys()
     for package in packages:
-        if package.replace('-', '_') == name:
+        if package.lower().replace('-', '_') == name:
             return True
     return False
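The effect is that package lookups become case-insensitive on both sides of the comparison, with hyphens in distribution names still normalized to underscores. A standalone restatement of the new matching rule (hypothetical helper name):

```python
def _matches(query, dist_name):
    # both sides lowercased; hyphens in the distribution name normalized
    return dist_name.lower().replace('-', '_') == query.lower()

assert _matches('en_core_web_sm', 'en-core-web-sm')
assert _matches('en_core_web_sm', 'En-Core-Web-Sm')  # failed before this fix
```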
@@ -194,6 +195,7 @@ def get_package_path(name):
     name (unicode): Package name.
     RETURNS (Path): Path to installed package.
     """
+    name = name.lower()  # use lowercase version to be safe
     # Here we're importing the module just to find it. This is worryingly
     # indirect, but it's otherwise very difficult to find the package.
     pkg = importlib.import_module(name)

spacy/vocab.pyx

@@ -262,7 +262,7 @@ cdef class Vocab:
         Words can be looked up by string or int ID.

         RETURNS:
-            A word vector. Size and shape determed by the
+            A word vector. Size and shape determined by the
             vocab.vectors instance. Usually, a numpy ndarray
             of shape (300,) and dtype float32.
@@ -324,6 +324,7 @@ cdef class Vocab:
             self.lexemes_from_bytes(file_.read())
         if self.vectors is not None:
             self.vectors.from_disk(path, exclude='strings.json')
+        link_vectors_to_models(self)
         return self

     def to_bytes(self, **exclude):
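A hedged usage sketch of the deserialization path touched above (the path is a placeholder): after this change, `Vocab.from_disk()` ends by calling `link_vectors_to_models`, so the loaded vectors are hooked back up to the model machinery without a manual relinking step.

```python
from spacy.vocab import Vocab

# placeholder path; from_disk() now calls link_vectors_to_models(self)
# before returning, so models built on this vocab see the loaded vectors
vocab = Vocab().from_disk('/path/to/vocab')
```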