diff --git a/requirements.txt b/requirements.txt index c6d43ddd7..a3404b0f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -cython>=0.25 numpy>=1.15.0 cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 @@ -15,6 +14,7 @@ jsonschema>=2.6.0,<3.0.0 wasabi>=0.0.8,<1.1.0 pathlib==1.0.1; python_version < "3.4" # Development dependencies +cython>=0.25 pytest>=4.0.0,<5.0.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 90387f9f7..5df9ddadb 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -7,7 +7,7 @@ from pathlib import Path from wasabi import Printer from ._messages import Messages -from ..compat import path2str +from ..compat import path2str, basestring_, unicode_ from .. import util from .. import about @@ -44,7 +44,7 @@ def info(model=None, markdown=False, silent=False): k: v for k, v in meta.items() if k not in ("accuracy", "speed") } if markdown: - util.print_markdown(model_meta, title=title) + print_markdown(model_meta, title=title) else: msg.table(model_meta, title=title) return meta @@ -58,7 +58,7 @@ def info(model=None, markdown=False, silent=False): if not silent: title = "Info about spaCy" if markdown: - util.print_markdown(data, title=title) + print_markdown(data, title=title) else: msg.table(data, title=title) return data @@ -75,3 +75,19 @@ def list_models(): models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()] return ", ".join([m for m in models if not exclude_dir(m)]) return "-" + + +def print_markdown(data, title=None): + """Print data in GitHub-flavoured Markdown format for issues etc. + + data (dict or list of tuples): Label/value pairs. + title (unicode or None): Title, will be rendered as headline 2. + """ + markdown = [] + for key, value in data.items(): + if isinstance(value, basestring_) and Path(value).exists(): + continue + markdown.append("* **{}:** {}".format(key, unicode_(value))) + if title: + print("\n## {}".format(title)) + print("\n{}\n".format("\n".join(markdown))) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 80e60a871..20d097047 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -160,7 +160,7 @@ def make_update(model, docs, optimizer, drop=0.0): return loss -def make_docs(nlp, batch): +def make_docs(nlp, batch, min_length=1, max_length=500): docs = [] for record in batch: text = record["text"] @@ -173,7 +173,7 @@ def make_docs(nlp, batch): heads = numpy.asarray(heads, dtype="uint64") heads = heads.reshape((len(doc), 1)) doc = doc.from_array([HEAD], heads) - if len(doc) >= 1 and len(doc) < 200: + if len(doc) >= min_length and len(doc) < max_length: docs.append(doc) return docs diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d49b94e44..9dec5d4bd 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -22,13 +22,13 @@ from .. import about # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. dropout_rates = util.decaying( - util.env_opt("dropout_from", 0.2), - util.env_opt("dropout_to", 0.2), + util.env_opt("dropout_from", 0.1), + util.env_opt("dropout_to", 0.1), util.env_opt("dropout_decay", 0.0), ) batch_sizes = util.compounding( - util.env_opt("batch_from", 1000), - util.env_opt("batch_to", 1000), + util.env_opt("batch_from", 750), + util.env_opt("batch_to", 750), util.env_opt("batch_compound", 1.001), ) @@ -144,24 +144,6 @@ def train( if learn_tokens: nlp.add_pipe(nlp.create_pipe("merge_subtokens")) - # Take dropout and batch size as generators of values -- dropout - # starts high and decays sharply, to force the optimizer to explore. - # Batch size starts at 1 and grows, so that we make updates quickly - # at the beginning of training. - dropout_rates = util.decaying( - util.env_opt("dropout_from", 0.1), - util.env_opt("dropout_to", 0.1), - util.env_opt("dropout_decay", 0.0), - ) - batch_sizes = util.compounding( - util.env_opt("batch_from", 750), - util.env_opt("batch_to", 750), - util.env_opt("batch_compound", 1.001), - ) - lang_class = util.get_lang_class(lang) - nlp = lang_class() - meta["pipeline"] = pipeline - nlp.meta.update(meta) if vectors: msg.text(Messages.M058.format(model=vectors)) _load_vectors(nlp, vectors) @@ -187,6 +169,7 @@ def train( else: # Start with a blank model, call begin_training optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) + nlp._optimizer = None # Load in pre-trained weights diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 0f25d7f53..26ff9753a 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -265,7 +265,7 @@ def _corrupt(c, noise_level): return '\n' elif c == '\n': return ' ' - elif c in ['.', "'", "!", "?"]: + elif c in ['.', "'", "!", "?", ',']: return '' else: return c.lower() diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 62d958ef2..3a09af644 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -13,7 +13,8 @@ from .util import msgpack from .util import msgpack_numpy from thinc.api import chain -from thinc.v2v import Affine, SELU, Softmax +from thinc.v2v import Affine, Maxout, Softmax +from thinc.misc import LayerNorm from thinc.t2v import Pooling, max_pool, mean_pool from thinc.neural.util import to_categorical, copy_array from thinc.neural._classes.difference import Siamese, CauchySimilarity @@ -442,7 +443,7 @@ class Tensorizer(Pipe): **cfg: Config parameters. RETURNS (Model): A `thinc.neural.Model` or similar instance. """ - input_size = util.env_opt('token_vector_width', cfg.get('input_size', 128)) + input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96)) return zero_init(Affine(output_size, input_size, drop_factor=0.0)) def __init__(self, vocab, model=True, **cfg): @@ -879,9 +880,10 @@ class MultitaskObjective(Tagger): @classmethod def Model(cls, n_tags, tok2vec=None, **cfg): token_vector_width = util.env_opt('token_vector_width', 96) - softmax = Softmax(n_tags, token_vector_width) + softmax = Softmax(n_tags, token_vector_width*2) model = chain( tok2vec, + LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)), softmax ) model.tok2vec = tok2vec diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 6e949fc35..0663c1289 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -510,7 +510,7 @@ cdef class Parser: for action, label_freqs in previous_labels.items(): for label in label_freqs: self.moves.add_action(action, label) - cfg.setdefault('token_vector_width', 128) + cfg.setdefault('token_vector_width', 96) if self.model is True: self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: diff --git a/spacy/util.py b/spacy/util.py index a070e3045..d8c82da89 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -18,7 +18,7 @@ import numpy.random from .symbols import ORTH -from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ +from .compat import cupy, CudaStream, path2str, basestring_, unicode_ from .compat import import_file, json_dumps from .errors import Errors @@ -587,19 +587,6 @@ def is_json_serializable(obj): return False -def get_raw_input(description, default=False): - """Get user input from the command line via raw_input / input. - - description (unicode): Text to display before prompt. - default (unicode or False/None): Default value to display with prompt. - RETURNS (unicode): User input. - """ - additional = " (default: %s)" % default if default else "" - prompt = " %s%s: " % (description, additional) - user_input = input_(prompt) - return user_input - - def to_bytes(getters, exclude): serialized = OrderedDict() for key, getter in getters.items(): @@ -634,27 +621,6 @@ def from_disk(path, readers, exclude): return path -def print_markdown(data, title=None): - """Print data in GitHub-flavoured Markdown format for issues etc. - - data (dict or list of tuples): Label/value pairs. - title (unicode or None): Title, will be rendered as headline 2. - """ - - def excl_value(value): - # contains path, i.e. personal info - return isinstance(value, basestring_) and Path(value).exists() - - if isinstance(data, dict): - data = list(data.items()) - markdown = [ - "* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v) - ] - if title: - print("\n## {}".format(title)) - print("\n{}\n".format("\n".join(markdown))) - - def minify_html(html): """Perform a template-specific, rudimentary HTML minification for displaCy. Disclaimer: NOT a general-purpose solution, only removes indentation and