Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-12-22 17:43:13 +03:00 · 2018-12-01 14:43:29 +01:00 · 2018-12-01 14:43:29 +01:00 · 40a273245c
commit 40a273245c
parent a31d557f2d d9d339186b
8 changed files with 35 additions and 68 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,3 @@
 cython>=0.25
 numpy>=1.15.0
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
@ -15,6 +14,7 @@ jsonschema>=2.6.0,<3.0.0
 wasabi>=0.0.8,<1.1.0
 pathlib==1.0.1; python_version < "3.4"
 # Development dependencies
 cython>=0.25
 pytest>=4.0.0,<5.0.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@ -7,7 +7,7 @@ from pathlib import Path
 from wasabi import Printer
 from ._messages import Messages
-from ..compat import path2str
+from ..compat import path2str, basestring_, unicode_
 from .. import util
 from .. import about
@ -44,7 +44,7 @@ def info(model=None, markdown=False, silent=False):
                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
            }
            if markdown:
-                util.print_markdown(model_meta, title=title)
+                print_markdown(model_meta, title=title)
            else:
                msg.table(model_meta, title=title)
        return meta
@ -58,7 +58,7 @@ def info(model=None, markdown=False, silent=False):
    if not silent:
        title = "Info about spaCy"
        if markdown:
-            util.print_markdown(data, title=title)
+            print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data
@ -75,3 +75,19 @@ def list_models():
        models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
        return ", ".join([m for m in models if not exclude_dir(m)])
    return "-"
 def print_markdown(data, title=None):
    """Print data in GitHub-flavoured Markdown format for issues etc.
    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    # Honour both documented input shapes: mappings are flattened to their
    # (label, value) pairs, anything else is iterated as pairs directly.
    pairs = data.items() if hasattr(data, "items") else data
    markdown = []
    for key, value in pairs:
        # Skip string values that point at an existing filesystem path,
        # since such paths can contain personal info.
        if isinstance(value, basestring_) and Path(value).exists():
            continue
        markdown.append("* **{}:** {}".format(key, unicode_(value)))
    if title:
        print("\n## {}".format(title))
    print("\n{}\n".format("\n".join(markdown)))
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -160,7 +160,7 @@ def make_update(model, docs, optimizer, drop=0.0):
    return loss
-def make_docs(nlp, batch):
+def make_docs(nlp, batch, min_length=1, max_length=500):
    docs = []
    for record in batch:
        text = record["text"]
@ -173,7 +173,7 @@ def make_docs(nlp, batch):
            heads = numpy.asarray(heads, dtype="uint64")
            heads = heads.reshape((len(doc), 1))
            doc = doc.from_array([HEAD], heads)
-        if len(doc) >= 1 and len(doc) < 200:
+        if len(doc) >= min_length and len(doc) < max_length:
            docs.append(doc)
    return docs
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -22,13 +22,13 @@ from .. import about
 # Batch size starts at 1 and grows, so that we make updates quickly
 # at the beginning of training.
 dropout_rates = util.decaying(
-    util.env_opt("dropout_from", 0.2),
+    util.env_opt("dropout_from", 0.1),
-    util.env_opt("dropout_to", 0.2),
+    util.env_opt("dropout_to", 0.1),
    util.env_opt("dropout_decay", 0.0),
 )
 batch_sizes = util.compounding(
-    util.env_opt("batch_from", 1000),
+    util.env_opt("batch_from", 750),
-    util.env_opt("batch_to", 1000),
+    util.env_opt("batch_to", 750),
    util.env_opt("batch_compound", 1.001),
 )
@ -144,24 +144,6 @@ def train(
    if learn_tokens:
        nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.1),
        util.env_opt("dropout_to", 0.1),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 750),
        util.env_opt("batch_to", 750),
        util.env_opt("batch_compound", 1.001),
    )
    lang_class = util.get_lang_class(lang)
    nlp = lang_class()
    meta["pipeline"] = pipeline
    nlp.meta.update(meta)
    if vectors:
        msg.text(Messages.M058.format(model=vectors))
        _load_vectors(nlp, vectors)
@ -187,6 +169,7 @@ def train(
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None
    # Load in pre-trained weights
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -265,7 +265,7 @@ def _corrupt(c, noise_level):
        return '\n'
    elif c == '\n':
        return ' '
-    elif c in ['.', "'", "!", "?"]:
+    elif c in ['.', "'", "!", "?", ',']:
        return ''
    else:
        return c.lower()
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -13,7 +13,8 @@ from .util import msgpack
 from .util import msgpack_numpy
 from thinc.api import chain
-from thinc.v2v import Affine, SELU, Softmax
+from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
 from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical, copy_array
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
@ -442,7 +443,7 @@ class Tensorizer(Pipe):
        **cfg: Config parameters.
        RETURNS (Model): A `thinc.neural.Model` or similar instance.
        """
-        input_size = util.env_opt('token_vector_width', cfg.get('input_size', 128))
+        input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96))
        return zero_init(Affine(output_size, input_size, drop_factor=0.0))
    def __init__(self, vocab, model=True, **cfg):
@ -879,9 +880,10 @@ class MultitaskObjective(Tagger):
    @classmethod
    def Model(cls, n_tags, tok2vec=None, **cfg):
        token_vector_width = util.env_opt('token_vector_width', 96)
-        softmax = Softmax(n_tags, token_vector_width)
+        softmax = Softmax(n_tags, token_vector_width*2)
        model = chain(
            tok2vec,
            LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)),
            softmax
        )
        model.tok2vec = tok2vec
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -510,7 +510,7 @@ cdef class Parser:
        for action, label_freqs in previous_labels.items():
            for label in label_freqs:
                self.moves.add_action(action, label)
-        cfg.setdefault('token_vector_width', 128)
+        cfg.setdefault('token_vector_width', 96)
        if self.model is True:
            self.model, cfg = self.Model(self.moves.n_moves, **cfg)
            if sgd is None:
--- a/spacy/util.py
+++ b/spacy/util.py
@ -18,7 +18,7 @@ import numpy.random
 from .symbols import ORTH
-from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
+from .compat import cupy, CudaStream, path2str, basestring_, unicode_
 from .compat import import_file, json_dumps
 from .errors import Errors
@ -587,19 +587,6 @@ def is_json_serializable(obj):
        return False
 def get_raw_input(description, default=False):
    """Get user input from the command line via raw_input / input.
    description (unicode): Text to display before prompt.
    default (unicode or False/None): Default value to display with prompt.
    RETURNS (unicode): User input.
    """
    additional = " (default: %s)" % default if default else ""
    prompt = "    %s%s: " % (description, additional)
    user_input = input_(prompt)
    return user_input
 def to_bytes(getters, exclude):
    serialized = OrderedDict()
    for key, getter in getters.items():
@ -634,27 +621,6 @@ def from_disk(path, readers, exclude):
    return path
 def print_markdown(data, title=None):
    """Print data in GitHub-flavoured Markdown format for issues etc.
    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    def excl_value(value):
        # contains path, i.e. personal info
        return isinstance(value, basestring_) and Path(value).exists()
    if isinstance(data, dict):
        data = list(data.items())
    markdown = [
        "* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)
    ]
    if title:
        print("\n## {}".format(title))
    print("\n{}\n".format("\n".join(markdown)))
 def minify_html(html):
    """Perform a template-specific, rudimentary HTML minification for displaCy.
    Disclaimer: NOT a general-purpose solution, only removes indentation and