Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2018-12-01 14:43:29 +01:00
commit 40a273245c
8 changed files with 35 additions and 68 deletions

View File

@ -1,4 +1,3 @@
cython>=0.25
numpy>=1.15.0
cymem>=2.0.2,<2.1.0
preshed>=2.0.1,<2.1.0
@ -15,6 +14,7 @@ jsonschema>=2.6.0,<3.0.0
wasabi>=0.0.8,<1.1.0
pathlib==1.0.1; python_version < "3.4"
# Development dependencies
cython>=0.25
pytest>=4.0.0,<5.0.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0

View File

@ -7,7 +7,7 @@ from pathlib import Path
from wasabi import Printer
from ._messages import Messages
from ..compat import path2str
from ..compat import path2str, basestring_, unicode_
from .. import util
from .. import about
@ -44,7 +44,7 @@ def info(model=None, markdown=False, silent=False):
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
if markdown:
util.print_markdown(model_meta, title=title)
print_markdown(model_meta, title=title)
else:
msg.table(model_meta, title=title)
return meta
@ -58,7 +58,7 @@ def info(model=None, markdown=False, silent=False):
if not silent:
title = "Info about spaCy"
if markdown:
util.print_markdown(data, title=title)
print_markdown(data, title=title)
else:
msg.table(data, title=title)
return data
@ -75,3 +75,19 @@ def list_models():
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
return ", ".join([m for m in models if not exclude_dir(m)])
return "-"
def print_markdown(data, title=None):
    """Print data in GitHub-flavoured Markdown format for issues etc.

    Each label/value pair is rendered as one bullet of the form
    "* **label:** value". Pairs whose value is a string naming an existing
    filesystem path are left out of the output.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    # Accept both a mapping and a plain sequence of (label, value) pairs, as
    # the docstring promises; calling .items() unconditionally fails on lists.
    pairs = data.items() if isinstance(data, dict) else data
    markdown = []
    for key, value in pairs:
        # Skip values that are existing paths: they may expose details of the
        # user's local file system when pasted into an issue.
        if isinstance(value, basestring_) and Path(value).exists():
            continue
        markdown.append("* **{}:** {}".format(key, unicode_(value)))
    if title:
        print("\n## {}".format(title))
    print("\n{}\n".format("\n".join(markdown)))

View File

@ -160,7 +160,7 @@ def make_update(model, docs, optimizer, drop=0.0):
return loss
def make_docs(nlp, batch):
def make_docs(nlp, batch, min_length=1, max_length=500):
docs = []
for record in batch:
text = record["text"]
@ -173,7 +173,7 @@ def make_docs(nlp, batch):
heads = numpy.asarray(heads, dtype="uint64")
heads = heads.reshape((len(doc), 1))
doc = doc.from_array([HEAD], heads)
if len(doc) >= 1 and len(doc) < 200:
if len(doc) >= min_length and len(doc) < max_length:
docs.append(doc)
return docs

View File

@ -22,13 +22,13 @@ from .. import about
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(
util.env_opt("dropout_from", 0.2),
util.env_opt("dropout_to", 0.2),
util.env_opt("dropout_from", 0.1),
util.env_opt("dropout_to", 0.1),
util.env_opt("dropout_decay", 0.0),
)
batch_sizes = util.compounding(
util.env_opt("batch_from", 1000),
util.env_opt("batch_to", 1000),
util.env_opt("batch_from", 750),
util.env_opt("batch_to", 750),
util.env_opt("batch_compound", 1.001),
)
@ -144,24 +144,6 @@ def train(
if learn_tokens:
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(
util.env_opt("dropout_from", 0.1),
util.env_opt("dropout_to", 0.1),
util.env_opt("dropout_decay", 0.0),
)
batch_sizes = util.compounding(
util.env_opt("batch_from", 750),
util.env_opt("batch_to", 750),
util.env_opt("batch_compound", 1.001),
)
lang_class = util.get_lang_class(lang)
nlp = lang_class()
meta["pipeline"] = pipeline
nlp.meta.update(meta)
if vectors:
msg.text(Messages.M058.format(model=vectors))
_load_vectors(nlp, vectors)
@ -187,6 +169,7 @@ def train(
else:
# Start with a blank model, call begin_training
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None
# Load in pre-trained weights

View File

@ -265,7 +265,7 @@ def _corrupt(c, noise_level):
return '\n'
elif c == '\n':
return ' '
elif c in ['.', "'", "!", "?"]:
elif c in ['.', "'", "!", "?", ',']:
return ''
else:
return c.lower()

View File

@ -13,7 +13,8 @@ from .util import msgpack
from .util import msgpack_numpy
from thinc.api import chain
from thinc.v2v import Affine, SELU, Softmax
from thinc.v2v import Affine, Maxout, Softmax
from thinc.misc import LayerNorm
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural.util import to_categorical, copy_array
from thinc.neural._classes.difference import Siamese, CauchySimilarity
@ -442,7 +443,7 @@ class Tensorizer(Pipe):
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
"""
input_size = util.env_opt('token_vector_width', cfg.get('input_size', 128))
input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96))
return zero_init(Affine(output_size, input_size, drop_factor=0.0))
def __init__(self, vocab, model=True, **cfg):
@ -879,9 +880,10 @@ class MultitaskObjective(Tagger):
@classmethod
def Model(cls, n_tags, tok2vec=None, **cfg):
token_vector_width = util.env_opt('token_vector_width', 96)
softmax = Softmax(n_tags, token_vector_width)
softmax = Softmax(n_tags, token_vector_width*2)
model = chain(
tok2vec,
LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)),
softmax
)
model.tok2vec = tok2vec

View File

@ -510,7 +510,7 @@ cdef class Parser:
for action, label_freqs in previous_labels.items():
for label in label_freqs:
self.moves.add_action(action, label)
cfg.setdefault('token_vector_width', 128)
cfg.setdefault('token_vector_width', 96)
if self.model is True:
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
if sgd is None:

View File

@ -18,7 +18,7 @@ import numpy.random
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
from .compat import import_file, json_dumps
from .errors import Errors
@ -587,19 +587,6 @@ def is_json_serializable(obj):
return False
def get_raw_input(description, default=False):
    """Prompt the user on the command line via raw_input / input.

    The default is only shown as a hint in the prompt; whatever the user
    typed is returned unchanged.

    description (unicode): Text to display before prompt.
    default (unicode or False/None): Default value to display with prompt.
    RETURNS (unicode): User input.
    """
    if default:
        hint = " (default: %s)" % default
    else:
        hint = ""
    return input_(" %s%s: " % (description, hint))
def to_bytes(getters, exclude):
serialized = OrderedDict()
for key, getter in getters.items():
@ -634,27 +621,6 @@ def from_disk(path, readers, exclude):
return path
def print_markdown(data, title=None):
    """Print label/value pairs as a GitHub-flavoured Markdown bullet list.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    pairs = list(data.items()) if isinstance(data, dict) else data
    lines = []
    for label, value in pairs:
        # A string value that is an existing path counts as personal info
        # and is left out of the output.
        if isinstance(value, basestring_) and Path(value).exists():
            continue
        lines.append("* **{}:** {}".format(label, unicode_(value)))
    if title:
        print("\n## {}".format(title))
    print("\n{}\n".format("\n".join(lines)))
def minify_html(html):
"""Perform a template-specific, rudimentary HTML minification for displaCy.
Disclaimer: NOT a general-purpose solution, only removes indentation and