mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-03 21:24:11 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
40a273245c
|
@ -1,4 +1,3 @@
|
||||||
cython>=0.25
|
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=2.0.1,<2.1.0
|
preshed>=2.0.1,<2.1.0
|
||||||
|
@ -15,6 +14,7 @@ jsonschema>=2.6.0,<3.0.0
|
||||||
wasabi>=0.0.8,<1.1.0
|
wasabi>=0.0.8,<1.1.0
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
pathlib==1.0.1; python_version < "3.4"
|
||||||
# Development dependencies
|
# Development dependencies
|
||||||
|
cython>=0.25
|
||||||
pytest>=4.0.0,<5.0.0
|
pytest>=4.0.0,<5.0.0
|
||||||
pytest-timeout>=1.3.0,<2.0.0
|
pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
|
|
|
@ -7,7 +7,7 @@ from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str
|
from ..compat import path2str, basestring_, unicode_
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ def info(model=None, markdown=False, silent=False):
|
||||||
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
|
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
|
||||||
}
|
}
|
||||||
if markdown:
|
if markdown:
|
||||||
util.print_markdown(model_meta, title=title)
|
print_markdown(model_meta, title=title)
|
||||||
else:
|
else:
|
||||||
msg.table(model_meta, title=title)
|
msg.table(model_meta, title=title)
|
||||||
return meta
|
return meta
|
||||||
|
@ -58,7 +58,7 @@ def info(model=None, markdown=False, silent=False):
|
||||||
if not silent:
|
if not silent:
|
||||||
title = "Info about spaCy"
|
title = "Info about spaCy"
|
||||||
if markdown:
|
if markdown:
|
||||||
util.print_markdown(data, title=title)
|
print_markdown(data, title=title)
|
||||||
else:
|
else:
|
||||||
msg.table(data, title=title)
|
msg.table(data, title=title)
|
||||||
return data
|
return data
|
||||||
|
@ -75,3 +75,19 @@ def list_models():
|
||||||
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
|
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
|
||||||
return ", ".join([m for m in models if not exclude_dir(m)])
|
return ", ".join([m for m in models if not exclude_dir(m)])
|
||||||
return "-"
|
return "-"
|
||||||
|
|
||||||
|
|
||||||
|
def print_markdown(data, title=None):
|
||||||
|
"""Print data in GitHub-flavoured Markdown format for issues etc.
|
||||||
|
|
||||||
|
data (dict or list of tuples): Label/value pairs.
|
||||||
|
title (unicode or None): Title, will be rendered as headline 2.
|
||||||
|
"""
|
||||||
|
markdown = []
|
||||||
|
for key, value in data.items():
|
||||||
|
if isinstance(value, basestring_) and Path(value).exists():
|
||||||
|
continue
|
||||||
|
markdown.append("* **{}:** {}".format(key, unicode_(value)))
|
||||||
|
if title:
|
||||||
|
print("\n## {}".format(title))
|
||||||
|
print("\n{}\n".format("\n".join(markdown)))
|
||||||
|
|
|
@ -160,7 +160,7 @@ def make_update(model, docs, optimizer, drop=0.0):
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
|
|
||||||
def make_docs(nlp, batch):
|
def make_docs(nlp, batch, min_length=1, max_length=500):
|
||||||
docs = []
|
docs = []
|
||||||
for record in batch:
|
for record in batch:
|
||||||
text = record["text"]
|
text = record["text"]
|
||||||
|
@ -173,7 +173,7 @@ def make_docs(nlp, batch):
|
||||||
heads = numpy.asarray(heads, dtype="uint64")
|
heads = numpy.asarray(heads, dtype="uint64")
|
||||||
heads = heads.reshape((len(doc), 1))
|
heads = heads.reshape((len(doc), 1))
|
||||||
doc = doc.from_array([HEAD], heads)
|
doc = doc.from_array([HEAD], heads)
|
||||||
if len(doc) >= 1 and len(doc) < 200:
|
if len(doc) >= min_length and len(doc) < max_length:
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
|
|
@ -22,13 +22,13 @@ from .. import about
|
||||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||||
# at the beginning of training.
|
# at the beginning of training.
|
||||||
dropout_rates = util.decaying(
|
dropout_rates = util.decaying(
|
||||||
util.env_opt("dropout_from", 0.2),
|
util.env_opt("dropout_from", 0.1),
|
||||||
util.env_opt("dropout_to", 0.2),
|
util.env_opt("dropout_to", 0.1),
|
||||||
util.env_opt("dropout_decay", 0.0),
|
util.env_opt("dropout_decay", 0.0),
|
||||||
)
|
)
|
||||||
batch_sizes = util.compounding(
|
batch_sizes = util.compounding(
|
||||||
util.env_opt("batch_from", 1000),
|
util.env_opt("batch_from", 750),
|
||||||
util.env_opt("batch_to", 1000),
|
util.env_opt("batch_to", 750),
|
||||||
util.env_opt("batch_compound", 1.001),
|
util.env_opt("batch_compound", 1.001),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -144,24 +144,6 @@ def train(
|
||||||
if learn_tokens:
|
if learn_tokens:
|
||||||
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
|
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
|
||||||
|
|
||||||
# Take dropout and batch size as generators of values -- dropout
|
|
||||||
# starts high and decays sharply, to force the optimizer to explore.
|
|
||||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
|
||||||
# at the beginning of training.
|
|
||||||
dropout_rates = util.decaying(
|
|
||||||
util.env_opt("dropout_from", 0.1),
|
|
||||||
util.env_opt("dropout_to", 0.1),
|
|
||||||
util.env_opt("dropout_decay", 0.0),
|
|
||||||
)
|
|
||||||
batch_sizes = util.compounding(
|
|
||||||
util.env_opt("batch_from", 750),
|
|
||||||
util.env_opt("batch_to", 750),
|
|
||||||
util.env_opt("batch_compound", 1.001),
|
|
||||||
)
|
|
||||||
lang_class = util.get_lang_class(lang)
|
|
||||||
nlp = lang_class()
|
|
||||||
meta["pipeline"] = pipeline
|
|
||||||
nlp.meta.update(meta)
|
|
||||||
if vectors:
|
if vectors:
|
||||||
msg.text(Messages.M058.format(model=vectors))
|
msg.text(Messages.M058.format(model=vectors))
|
||||||
_load_vectors(nlp, vectors)
|
_load_vectors(nlp, vectors)
|
||||||
|
@ -187,6 +169,7 @@ def train(
|
||||||
else:
|
else:
|
||||||
# Start with a blank model, call begin_training
|
# Start with a blank model, call begin_training
|
||||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||||
|
|
||||||
nlp._optimizer = None
|
nlp._optimizer = None
|
||||||
|
|
||||||
# Load in pre-trained weights
|
# Load in pre-trained weights
|
||||||
|
|
|
@ -265,7 +265,7 @@ def _corrupt(c, noise_level):
|
||||||
return '\n'
|
return '\n'
|
||||||
elif c == '\n':
|
elif c == '\n':
|
||||||
return ' '
|
return ' '
|
||||||
elif c in ['.', "'", "!", "?"]:
|
elif c in ['.', "'", "!", "?", ',']:
|
||||||
return ''
|
return ''
|
||||||
else:
|
else:
|
||||||
return c.lower()
|
return c.lower()
|
||||||
|
|
|
@ -13,7 +13,8 @@ from .util import msgpack
|
||||||
from .util import msgpack_numpy
|
from .util import msgpack_numpy
|
||||||
|
|
||||||
from thinc.api import chain
|
from thinc.api import chain
|
||||||
from thinc.v2v import Affine, SELU, Softmax
|
from thinc.v2v import Affine, Maxout, Softmax
|
||||||
|
from thinc.misc import LayerNorm
|
||||||
from thinc.t2v import Pooling, max_pool, mean_pool
|
from thinc.t2v import Pooling, max_pool, mean_pool
|
||||||
from thinc.neural.util import to_categorical, copy_array
|
from thinc.neural.util import to_categorical, copy_array
|
||||||
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||||
|
@ -442,7 +443,7 @@ class Tensorizer(Pipe):
|
||||||
**cfg: Config parameters.
|
**cfg: Config parameters.
|
||||||
RETURNS (Model): A `thinc.neural.Model` or similar instance.
|
RETURNS (Model): A `thinc.neural.Model` or similar instance.
|
||||||
"""
|
"""
|
||||||
input_size = util.env_opt('token_vector_width', cfg.get('input_size', 128))
|
input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96))
|
||||||
return zero_init(Affine(output_size, input_size, drop_factor=0.0))
|
return zero_init(Affine(output_size, input_size, drop_factor=0.0))
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
|
@ -879,9 +880,10 @@ class MultitaskObjective(Tagger):
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, tok2vec=None, **cfg):
|
def Model(cls, n_tags, tok2vec=None, **cfg):
|
||||||
token_vector_width = util.env_opt('token_vector_width', 96)
|
token_vector_width = util.env_opt('token_vector_width', 96)
|
||||||
softmax = Softmax(n_tags, token_vector_width)
|
softmax = Softmax(n_tags, token_vector_width*2)
|
||||||
model = chain(
|
model = chain(
|
||||||
tok2vec,
|
tok2vec,
|
||||||
|
LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)),
|
||||||
softmax
|
softmax
|
||||||
)
|
)
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
|
|
|
@ -510,7 +510,7 @@ cdef class Parser:
|
||||||
for action, label_freqs in previous_labels.items():
|
for action, label_freqs in previous_labels.items():
|
||||||
for label in label_freqs:
|
for label in label_freqs:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
cfg.setdefault('token_vector_width', 128)
|
cfg.setdefault('token_vector_width', 96)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
|
|
|
@ -18,7 +18,7 @@ import numpy.random
|
||||||
|
|
||||||
|
|
||||||
from .symbols import ORTH
|
from .symbols import ORTH
|
||||||
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
||||||
from .compat import import_file, json_dumps
|
from .compat import import_file, json_dumps
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
|
|
||||||
|
@ -587,19 +587,6 @@ def is_json_serializable(obj):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_raw_input(description, default=False):
|
|
||||||
"""Get user input from the command line via raw_input / input.
|
|
||||||
|
|
||||||
description (unicode): Text to display before prompt.
|
|
||||||
default (unicode or False/None): Default value to display with prompt.
|
|
||||||
RETURNS (unicode): User input.
|
|
||||||
"""
|
|
||||||
additional = " (default: %s)" % default if default else ""
|
|
||||||
prompt = " %s%s: " % (description, additional)
|
|
||||||
user_input = input_(prompt)
|
|
||||||
return user_input
|
|
||||||
|
|
||||||
|
|
||||||
def to_bytes(getters, exclude):
|
def to_bytes(getters, exclude):
|
||||||
serialized = OrderedDict()
|
serialized = OrderedDict()
|
||||||
for key, getter in getters.items():
|
for key, getter in getters.items():
|
||||||
|
@ -634,27 +621,6 @@ def from_disk(path, readers, exclude):
|
||||||
return path
|
return path
|
||||||
|
|
||||||
|
|
||||||
def print_markdown(data, title=None):
|
|
||||||
"""Print data in GitHub-flavoured Markdown format for issues etc.
|
|
||||||
|
|
||||||
data (dict or list of tuples): Label/value pairs.
|
|
||||||
title (unicode or None): Title, will be rendered as headline 2.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def excl_value(value):
|
|
||||||
# contains path, i.e. personal info
|
|
||||||
return isinstance(value, basestring_) and Path(value).exists()
|
|
||||||
|
|
||||||
if isinstance(data, dict):
|
|
||||||
data = list(data.items())
|
|
||||||
markdown = [
|
|
||||||
"* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)
|
|
||||||
]
|
|
||||||
if title:
|
|
||||||
print("\n## {}".format(title))
|
|
||||||
print("\n{}\n".format("\n".join(markdown)))
|
|
||||||
|
|
||||||
|
|
||||||
def minify_html(html):
|
def minify_html(html):
|
||||||
"""Perform a template-specific, rudimentary HTML minification for displaCy.
|
"""Perform a template-specific, rudimentary HTML minification for displaCy.
|
||||||
Disclaimer: NOT a general-purpose solution, only removes indentation and
|
Disclaimer: NOT a general-purpose solution, only removes indentation and
|
||||||
|
|
Loading…
Reference in New Issue
Block a user