Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-25 00:34:20 +03:00)

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Commit 40a273245c

@@ -1,4 +1,3 @@
-cython>=0.25
 numpy>=1.15.0
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0

@@ -15,6 +14,7 @@ jsonschema>=2.6.0,<3.0.0
 wasabi>=0.0.8,<1.1.0
 pathlib==1.0.1; python_version < "3.4"
 # Development dependencies
+cython>=0.25
 pytest>=4.0.0,<5.0.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0

@@ -7,7 +7,7 @@ from pathlib import Path
 from wasabi import Printer

 from ._messages import Messages
-from ..compat import path2str
+from ..compat import path2str, basestring_, unicode_
 from .. import util
 from .. import about

@@ -44,7 +44,7 @@ def info(model=None, markdown=False, silent=False):
            k: v for k, v in meta.items() if k not in ("accuracy", "speed")
        }
        if markdown:
-            util.print_markdown(model_meta, title=title)
+            print_markdown(model_meta, title=title)
        else:
            msg.table(model_meta, title=title)
        return meta

@@ -58,7 +58,7 @@ def info(model=None, markdown=False, silent=False):
    if not silent:
        title = "Info about spaCy"
        if markdown:
-            util.print_markdown(data, title=title)
+            print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data

@@ -75,3 +75,19 @@ def list_models():
        models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
        return ", ".join([m for m in models if not exclude_dir(m)])
    return "-"
+
+
+def print_markdown(data, title=None):
+    """Print data in GitHub-flavoured Markdown format for issues etc.
+
+    data (dict or list of tuples): Label/value pairs.
+    title (unicode or None): Title, will be rendered as headline 2.
+    """
+    markdown = []
+    for key, value in data.items():
+        if isinstance(value, basestring_) and Path(value).exists():
+            continue
+        markdown.append("* **{}:** {}".format(key, unicode_(value)))
+    if title:
+        print("\n## {}".format(title))
+    print("\n{}\n".format("\n".join(markdown)))

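For context, a minimal usage sketch of the print_markdown helper added above; the sample data is invented for illustration:

    # Hypothetical data; real callers pass the model or spaCy meta dict.
    # Values that are existing filesystem paths are skipped, so personal
    # info doesn't end up in pasted bug reports.
    info_data = {"spaCy version": "2.1.0", "Models": "en_core_web_sm"}
    print_markdown(info_data, title="Info about spaCy")
    # Prints:
    # ## Info about spaCy
    #
    # * **spaCy version:** 2.1.0
    # * **Models:** en_core_web_sm
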
@@ -160,7 +160,7 @@ def make_update(model, docs, optimizer, drop=0.0):
     return loss


-def make_docs(nlp, batch):
+def make_docs(nlp, batch, min_length=1, max_length=500):
     docs = []
     for record in batch:
         text = record["text"]

|
@ -173,7 +173,7 @@ def make_docs(nlp, batch):
|
|||
heads = numpy.asarray(heads, dtype="uint64")
|
||||
heads = heads.reshape((len(doc), 1))
|
||||
doc = doc.from_array([HEAD], heads)
|
||||
if len(doc) >= 1 and len(doc) < 200:
|
||||
if len(doc) >= min_length and len(doc) < max_length:
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
||||
|
|
|
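The length bounds that were hard-coded (1 and 200) are now keyword arguments with wider defaults. A sketch of calling it directly, assuming a loaded pipeline; the records here are invented, and real batches come from the pretraining corpus and may also carry "tokens" and "heads" fields:

    import spacy

    nlp = spacy.blank("en")  # any pipeline with a tokenizer works here
    batch = [{"text": "This sentence is long enough to keep."}, {"text": "x"}]
    # Keep only docs with at least 5 and fewer than 100 tokens.
    docs = make_docs(nlp, batch, min_length=5, max_length=100)
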
@@ -22,13 +22,13 @@ from .. import about
 # Batch size starts at 1 and grows, so that we make updates quickly
 # at the beginning of training.
 dropout_rates = util.decaying(
-    util.env_opt("dropout_from", 0.2),
-    util.env_opt("dropout_to", 0.2),
+    util.env_opt("dropout_from", 0.1),
+    util.env_opt("dropout_to", 0.1),
     util.env_opt("dropout_decay", 0.0),
 )
 batch_sizes = util.compounding(
-    util.env_opt("batch_from", 1000),
-    util.env_opt("batch_to", 1000),
+    util.env_opt("batch_from", 750),
+    util.env_opt("batch_to", 750),
     util.env_opt("batch_compound", 1.001),
 )

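Note that util.decaying and util.compounding return infinite generators; with equal "from" and "to" values, as in the new defaults, they effectively yield a constant. A quick sketch of the general behaviour, with values chosen for illustration:

    from spacy import util

    # Compounding: each value is the previous one times the factor,
    # clipped at the stop value: 1.0, 1.001, 1.002001, ... up to 32.0.
    batch_sizes = util.compounding(1.0, 32.0, 1.001)
    # Decaying: starts at 0.6 and decays towards 0.2.
    dropout_rates = util.decaying(0.6, 0.2, 1e-4)

    print(next(batch_sizes))    # 1.0
    print(next(batch_sizes))    # 1.001
    print(next(dropout_rates))  # 0.6
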
@@ -144,24 +144,6 @@ def train(
     if learn_tokens:
         nlp.add_pipe(nlp.create_pipe("merge_subtokens"))

-    # Take dropout and batch size as generators of values -- dropout
-    # starts high and decays sharply, to force the optimizer to explore.
-    # Batch size starts at 1 and grows, so that we make updates quickly
-    # at the beginning of training.
-    dropout_rates = util.decaying(
-        util.env_opt("dropout_from", 0.1),
-        util.env_opt("dropout_to", 0.1),
-        util.env_opt("dropout_decay", 0.0),
-    )
-    batch_sizes = util.compounding(
-        util.env_opt("batch_from", 750),
-        util.env_opt("batch_to", 750),
-        util.env_opt("batch_compound", 1.001),
-    )
     lang_class = util.get_lang_class(lang)
     nlp = lang_class()
     meta["pipeline"] = pipeline
     nlp.meta.update(meta)
     if vectors:
         msg.text(Messages.M058.format(model=vectors))
         _load_vectors(nlp, vectors)

|
@ -187,6 +169,7 @@ def train(
|
|||
else:
|
||||
# Start with a blank model, call begin_training
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
|
||||
nlp._optimizer = None
|
||||
|
||||
# Load in pre-trained weights
|
||||
|
|
|
@@ -265,7 +265,7 @@ def _corrupt(c, noise_level):
         return '\n'
     elif c == '\n':
         return ' '
-    elif c in ['.', "'", "!", "?"]:
+    elif c in ['.', "'", "!", "?", ',']:
         return ''
     else:
         return c.lower()

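The change adds the comma to the punctuation characters that corruption deletes. A minimal sketch of a driver for this kind of character-level noise, assuming _corrupt returns the character unchanged with probability 1 - noise_level (as the branches above this hunk suggest); the add_noise wrapper here is illustrative, not the library's exact helper:

    def add_noise(text, noise_level=0.25):
        # Swap spaces and newlines, drop sentence punctuation (now commas
        # too), and lowercase everything else, each with the given chance.
        return "".join(_corrupt(c, noise_level) for c in text)
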
@@ -13,7 +13,8 @@ from .util import msgpack
 from .util import msgpack_numpy

 from thinc.api import chain
-from thinc.v2v import Affine, SELU, Softmax
+from thinc.v2v import Affine, Maxout, Softmax
+from thinc.misc import LayerNorm
 from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical, copy_array
 from thinc.neural._classes.difference import Siamese, CauchySimilarity

@@ -442,7 +443,7 @@ class Tensorizer(Pipe):
         **cfg: Config parameters.
         RETURNS (Model): A `thinc.neural.Model` or similar instance.
         """
-        input_size = util.env_opt('token_vector_width', cfg.get('input_size', 128))
+        input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96))
         return zero_init(Affine(output_size, input_size, drop_factor=0.0))

     def __init__(self, vocab, model=True, **cfg):

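Because the width goes through util.env_opt, it can still be overridden per process without touching code. A sketch, under the assumption that env_opt consults a SPACY_-prefixed environment variable before falling back to the default; verify the variable name against your spaCy version:

    import os

    # Assumed mapping: env_opt("token_vector_width", ...) reads
    # SPACY_TOKEN_VECTOR_WIDTH, so this would restore the old width of 128.
    os.environ["SPACY_TOKEN_VECTOR_WIDTH"] = "128"
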
@@ -879,9 +880,10 @@ class MultitaskObjective(Tagger):
     @classmethod
     def Model(cls, n_tags, tok2vec=None, **cfg):
         token_vector_width = util.env_opt('token_vector_width', 96)
-        softmax = Softmax(n_tags, token_vector_width)
+        softmax = Softmax(n_tags, token_vector_width*2)
         model = chain(
             tok2vec,
+            LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)),
             softmax
         )
         model.tok2vec = tok2vec

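The multitask head thus gains a normalized maxout hidden layer between the tok2vec features and the softmax, doubling the width the softmax sees. A minimal standalone sketch of the same composition; the Affine stand-in replaces a real tok2vec layer, and n_tags is an invented label count:

    from thinc.api import chain
    from thinc.misc import LayerNorm
    from thinc.v2v import Affine, Maxout, Softmax

    width = 96   # matches the new token_vector_width default
    n_tags = 50  # illustrative label count
    tok2vec = Affine(width, width)  # stand-in for a real tok2vec layer

    # Maxout(nO=width*2, nI=width, pieces=3) projects each vector up to
    # twice the width; LayerNorm stabilises it before the output layer.
    hidden = LayerNorm(Maxout(width * 2, width, pieces=3))
    model = chain(tok2vec, hidden, Softmax(n_tags, width * 2))
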
@@ -510,7 +510,7 @@ cdef class Parser:
         for action, label_freqs in previous_labels.items():
             for label in label_freqs:
                 self.moves.add_action(action, label)
-        cfg.setdefault('token_vector_width', 128)
+        cfg.setdefault('token_vector_width', 96)
         if self.model is True:
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
         if sgd is None:

@@ -18,7 +18,7 @@ import numpy.random


 from .symbols import ORTH
-from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
+from .compat import cupy, CudaStream, path2str, basestring_, unicode_
 from .compat import import_file, json_dumps
 from .errors import Errors

@@ -587,19 +587,6 @@ def is_json_serializable(obj):
     return False


-def get_raw_input(description, default=False):
-    """Get user input from the command line via raw_input / input.
-
-    description (unicode): Text to display before prompt.
-    default (unicode or False/None): Default value to display with prompt.
-    RETURNS (unicode): User input.
-    """
-    additional = " (default: %s)" % default if default else ""
-    prompt = " %s%s: " % (description, additional)
-    user_input = input_(prompt)
-    return user_input
-
-
 def to_bytes(getters, exclude):
     serialized = OrderedDict()
     for key, getter in getters.items():

@@ -634,27 +621,6 @@ def from_disk(path, readers, exclude):
     return path


-def print_markdown(data, title=None):
-    """Print data in GitHub-flavoured Markdown format for issues etc.
-
-    data (dict or list of tuples): Label/value pairs.
-    title (unicode or None): Title, will be rendered as headline 2.
-    """
-
-    def excl_value(value):
-        # contains path, i.e. personal info
-        return isinstance(value, basestring_) and Path(value).exists()
-
-    if isinstance(data, dict):
-        data = list(data.items())
-    markdown = [
-        "* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)
-    ]
-    if title:
-        print("\n## {}".format(title))
-    print("\n{}\n".format("\n".join(markdown)))
-
-
 def minify_html(html):
     """Perform a template-specific, rudimentary HTML minification for displaCy.
     Disclaimer: NOT a general-purpose solution, only removes indentation and
|
Loading…
Reference in New Issue
Block a user