Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-11 12:18:04 +03:00)

Commit f34d6281d6: Merge branch 'develop' into spacy.io
appveyor.yml
@@ -1,51 +1,21 @@
 environment:
   matrix:
-    # For Python versions available on Appveyor, see
-    # http://www.appveyor.com/docs/installed-software#python
-    #- PYTHON: "C:\\Python27-x64"
-    #- PYTHON: "C:\\Python34"
-    #- PYTHON: "C:\\Python35"
-    #- DISTUTILS_USE_SDK: "1"
-    #- PYTHON: "C:\\Python34-x64"
-    #- DISTUTILS_USE_SDK: "1"
     - PYTHON: "C:\\Python35-x64"
     - PYTHON: "C:\\Python36-x64"
     - PYTHON: "C:\\Python37-x64"

 install:
-  # We need wheel installed to build wheels
   - "%PYTHON%\\python.exe -m pip install wheel"
   - "%PYTHON%\\python.exe -m pip install cython"
   - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
   - "%PYTHON%\\python.exe -m pip install -e ."

 build: off

 test_script:
-  # Put your test command here.
-  # If you don't need to build C extensions on 64-bit Python 3.4,
-  # you can remove "build.cmd" from the front of the command, as it's
-  # only needed to support those cases.
-  # Note that you must use the environment variable %PYTHON% to refer to
-  # the interpreter you're using - Appveyor does not do anything special
-  # to put the Python version you want to use on PATH.
   - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"

 after_test:
-  # This step builds your wheels.
-  # Again, you only need build.cmd if you're building C extensions for
-  # 64-bit Python 3.4. And you need to use %PYTHON% to get the correct
-  # interpreter
   - "%PYTHON%\\python.exe setup.py bdist_wheel"

 artifacts:
-  # bdist_wheel puts your built wheel in the dist directory
   - path: dist\*

-#on_success:
-#  You can use this step to upload your artifacts to a public website.
-#  See Appveyor's documentation for more details. Or you can simply
-#  access your wheels from the Appveyor "artifacts" tab for your build.

 branches:
   except:
     - spacy.io
.travis.yml (14 changes)
@@ -1,26 +1,20 @@
 language: python

 sudo: false
 cache: pip
 dist: trusty
 group: edge

 python:
   - "2.7"
   - "3.5"
   - "3.6"

 os:
   - linux

 env:
   - VIA=compile
   - VIA=flake8
   #- VIA=pypi_nightly

 install:
   - "./travis.sh"
   - pip install flake8

 script:
   - "cat /proc/cpuinfo | grep flags | head -n 1"
   - "pip install pytest pytest-timeout"

@@ -28,10 +22,10 @@ script:
   - if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
   - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
   - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi

 branches:
   except:
     - spacy.io
 notifications:
   slack:
     secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
   email: false

 cache: pip
examples/training/train_textcat.py
@@ -41,7 +41,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     # add the text classifier to the pipeline if it doesn't exist
     # nlp.create_pipe works for built-ins that are registered with spaCy
     if "textcat" not in nlp.pipe_names:
-        textcat = nlp.create_pipe("textcat")
+        textcat = nlp.create_pipe("textcat", config={
+            "architecture": "simple_cnn",
+            "exclusive_classes": True})
         nlp.add_pipe(textcat, last=True)
     # otherwise, get it, so we can add labels to it
     else:

@@ -70,7 +72,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     for i in range(n_iter):
         losses = {}
         # batch up the examples using spaCy's minibatch
-        batches = minibatch(train_data, size=compounding(4.0, 16.0, 1.001))
+        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
         for batch in batches:
             texts, annotations = zip(*batch)
             nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)

@@ -138,6 +140,9 @@ def evaluate(tokenizer, textcat, texts, cats):
             fn += 1
     precision = tp / (tp + fp)
     recall = tp / (tp + fn)
-    f_score = 2 * (precision * recall) / (precision + recall)
+    if (precision + recall) == 0:
+        f_score = 0.0
+    else:
+        f_score = 2 * (precision * recall) / (precision + recall)
     return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
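For context, a minimal sketch of how a user opts into the new "simple_cnn" architecture after this change. The blank pipeline and label names are illustrative, not part of the diff:

```python
import spacy

# Sketch only: exercises the new create_pipe config shown above.
nlp = spacy.blank("en")
textcat = nlp.create_pipe(
    "textcat",
    config={"architecture": "simple_cnn", "exclusive_classes": True},
)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat, last=True)
optimizer = nlp.begin_training()
```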
requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc>=7.0.1,<7.1.0
+thinc>=7.0.2,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.0.12,<1.1.0
setup.py (2 changes)
@@ -227,7 +227,7 @@ def setup_package():
             "murmurhash>=0.28.0,<1.1.0",
             "cymem>=2.0.2,<2.1.0",
             "preshed>=2.0.1,<2.1.0",
-            "thinc>=7.0.1,<7.1.0",
+            "thinc>=7.0.2,<7.1.0",
             "blis>=0.2.2,<0.3.0",
             "plac<1.0.0,>=0.9.6",
             "requests>=2.13.0,<3.0.0",
spacy/_ml.py (24 changes)
@@ -72,10 +72,10 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.0):


 def _zero_init(model):
-    def _zero_init_impl(self, X, y):
+    def _zero_init_impl(self, *args, **kwargs):
         self.W.fill(0)
-    model.on_data_hooks.append(_zero_init_impl)
+    model.on_init_hooks.append(_zero_init_impl)
+    if model.W is not None:
+        model.W.fill(0.0)
     return model

@@ -564,18 +564,26 @@ def build_text_classifier(nr_class, width=64, **cfg):
     )

     linear_model = _preprocess_doc >> LinearModel(nr_class)
-    model = (
-        (linear_model | cnn_model)
-        >> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
-        >> logistic
-    )
-    model.tok2vec = tok2vec
+    if cfg.get('exclusive_classes'):
+        output_layer = Softmax(nr_class, nr_class * 2)
+    else:
+        output_layer = (
+            zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
+            >> logistic
+        )
+    model = (
+        (linear_model | cnn_model)
+        >> output_layer
+    )
+    model.tok2vec = chain(tok2vec, flatten)
     model.nO = nr_class
     model.lsuv = False
     return model


-def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=True, **cfg):
+def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the

@@ -586,7 +594,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=True,
     if exclusive_classes:
         output_layer = Softmax(nr_class, tok2vec.nO)
     else:
-        output_layer = zero_init(Affine(nr_class, tok2vec.nO)) >> logistic
+        output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
     model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
     model.tok2vec = chain(tok2vec, flatten)
     model.nO = nr_class
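The branching above is the crux of the change: exclusive classes get a Softmax output layer, while the multi-label case keeps Affine followed by logistic. A small NumPy sketch (illustrative only, not spaCy/Thinc code) of why the two differ:

```python
import numpy as np

logits = np.asarray([2.0, 1.0, 0.1])

# Softmax: scores compete and sum to 1, suited to mutually exclusive
# classes, where exactly one label should win.
softmax = np.exp(logits) / np.exp(logits).sum()
assert abs(softmax.sum() - 1.0) < 1e-9

# Logistic: each class is scored independently in (0, 1), suited to
# multi-label classification, where several labels may apply at once.
logistic = 1.0 / (1.0 + np.exp(-logits))
```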
spacy/about.py
@@ -4,13 +4,13 @@
 # fmt: off

 __title__ = "spacy-nightly"
-__version__ = "2.1.0a8"
+__version__ = "2.1.0a9.dev1"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = True
+__release__ = False

 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/displacy/render.py
@@ -253,10 +253,10 @@ class EntityRenderer(object):
             label = span["label"]
             start = span["start"]
             end = span["end"]
-            entity = text[start:end]
+            entity = escape_html(text[start:end])
             fragments = text[offset:start].split("\n")
             for i, fragment in enumerate(fragments):
-                markup += fragment
+                markup += escape_html(fragment)
                 if len(fragments) > 1 and i != len(fragments) - 1:
                     markup += "</br>"
             if self.ents is None or label.upper() in self.ents:

@@ -265,7 +265,7 @@ class EntityRenderer(object):
             else:
                 markup += entity
             offset = end
-        markup += text[offset:]
+        markup += escape_html(text[offset:])
         markup = TPL_ENTS.format(content=markup, colors=self.colors)
         if title:
             markup = TPL_TITLE.format(title=title) + markup
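escape_html is a small helper in the displaCy module; a minimal sketch of what such a helper does (illustrative, the real implementation may differ in detail):

```python
def escape_html(text):
    """Replace HTML-sensitive characters so user text can't inject markup."""
    # "&" must be replaced first, or the entities below would be re-escaped.
    return (
        text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )

assert escape_html("<RELEASE>") == "&lt;RELEASE&gt;"
```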
spacy/pipeline/pipes.pyx
@@ -24,7 +24,8 @@ from ..vocab cimport Vocab
 from ..syntax import nonproj
 from ..attrs import POS, ID
 from ..parts_of_speech import X
-from .._ml import Tok2Vec, build_tagger_model, build_simple_cnn_text_classifier
+from .._ml import Tok2Vec, build_tagger_model
+from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors

@@ -862,8 +863,11 @@ class TextCategorizer(Pipe):
             token_vector_width = cfg["token_vector_width"]
         else:
             token_vector_width = util.env_opt("token_vector_width", 96)
-        tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
-        return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
+        if cfg.get('architecture') == 'simple_cnn':
+            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
+            return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
+        else:
+            return build_text_classifier(nr_class, **cfg)

     @property
     def tok2vec(self):

@@ -942,7 +946,7 @@ class TextCategorizer(Pipe):
         not_missing = self.model.ops.asarray(not_missing)
         d_scores = (scores-truths) / scores.shape[0]
         d_scores *= not_missing
-        mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
+        mean_square_error = (d_scores**2).sum(axis=1).mean()
         return float(mean_square_error), d_scores

     def add_label(self, label):

@@ -964,11 +968,6 @@ class TextCategorizer(Pipe):

     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
-        if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
-            token_vector_width = pipeline[0].model.nO
-        else:
-            token_vector_width = 64
-
         if self.model is True:
             self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
             self.model = self.Model(len(self.labels), **self.cfg)
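The get_loss fix above matters when some labels are missing: the gradient d_scores is already masked by not_missing, so computing the reported error from d_scores rather than the raw (scores - truths) keeps unlabelled examples out of the loss. A NumPy sketch of the difference, with toy numbers rather than spaCy internals:

```python
import numpy as np

scores = np.asarray([[0.9, 0.1], [0.5, 0.5]])
truths = np.asarray([[1.0, 0.0], [0.0, 0.0]])
not_missing = np.asarray([[1.0, 1.0], [0.0, 0.0]])  # 2nd example unlabelled

d_scores = (scores - truths) / scores.shape[0]
d_scores *= not_missing  # masked gradient, as in the pipeline code

old_loss = ((scores - truths) ** 2).sum(axis=1).mean()  # counts masked rows
new_loss = (d_scores ** 2).sum(axis=1).mean()           # ignores masked rows
assert new_loss < old_loss
```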
@@ -204,6 +204,8 @@ class ParserModel(Model):
         if new_output == self.upper.nO:
             return
         smaller = self.upper

-        larger = Affine(new_output, smaller.nI)
+        with Model.use_device('cpu'):
+            larger = Affine(new_output, smaller.nI)
+        # Set nan as value for unseen classes, to prevent prediction.
         larger.W.fill(self.ops.xp.nan)
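The idea behind this resize step: when new labels are added, the output layer grows, and weights for classes the model has never been trained on are filled with nan so they can never produce a valid winning score. A NumPy sketch of the weight-copy pattern (illustrative only; the real code builds a Thinc Affine layer on CPU):

```python
import numpy as np

old_W = np.random.randn(3, 8)      # 3 trained classes
new_W = np.full((5, 8), np.nan)    # layer grown to 5 classes
new_W[:old_W.shape[0]] = old_W     # keep the trained weights
# Rows 3-4 stay nan: any score computed for an unseen class is nan,
# which downstream decoding treats as invalid, preventing prediction.
```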
spacy/tests/regression/test_issue2728.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy import displacy
+from spacy.tokens import Doc, Span
+
+
+def test_issue2728(en_vocab):
+    """Test that displaCy ENT visualizer escapes HTML correctly."""
+    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
+    doc.ents = [Span(doc, 0, 1, label="TEST")]
+    html = displacy.render(doc, style="ent")
+    assert "&lt;RELEASE&gt;" in html
+    doc.ents = [Span(doc, 1, 2, label="TEST")]
+    html = displacy.render(doc, style="ent")
+    assert "&lt;RELEASE&gt;" in html
website/docs/api/tokenizer.md
@@ -107,8 +107,8 @@ details and examples.
 >
 > ```python
 > from spacy.attrs import ORTH, LEMMA
-> case = [{"don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]}]
-> tokenizer.add_special_case(case)
+> case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
+> tokenizer.add_special_case("don't", case)
 > ```

 | Name | Type | Description |
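The corrected signature takes the string to match first and the list of token attributes second. A runnable sketch of the fixed usage (the blank English pipeline here is illustrative):

```python
import spacy
from spacy.attrs import ORTH, LEMMA

nlp = spacy.blank("en")
case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
nlp.tokenizer.add_special_case("don't", case)
print([t.text for t in nlp("don't")])  # ['do', "n't"]
```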
website/docs/usage/index.md
@@ -8,7 +8,7 @@ menu:
   - ['Changelog', 'changelog']
 ---

-spaCy is compatible with **64-bit CPython 2.6+/3.3+** and runs on
+spaCy is compatible with **64-bit CPython 2.7+/3.4+** and runs on
 **Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are
 available over [pip](https://pypi.python.org/pypi/spacy) and
 [conda](https://anaconda.org/conda-forge/spacy).
website/docs/usage/v2-1.md
@@ -10,11 +10,11 @@ menu:

 spaCy v2.1 has focussed primarily on stability and performance, solidifying the
 design changes introduced in [v2.0](/usage/v2). As well as smaller models,
-faster runtime, and many bug-fixes, v2.1 also introduces experimental support
+faster runtime, and many bug fixes, v2.1 also introduces experimental support
 for some exciting new NLP innovations. For the full changelog, see the
 [release notes on GitHub](https://github.com/explosion/spaCy/releases/tag/v2.1.0).

-### BERT/ULMFit/Elmo-style pre-training
+### BERT/ULMFit/Elmo-style pre-training {tag="experimental"}

 > #### Example
 >

@@ -115,33 +115,6 @@ or `POS` for finding sequences of the same part-of-speech tags.

 </Infobox>

-### Components and languages via entry points
-
-> #### Example
->
-> ```python
-> from setuptools import setup
-> setup(
->     name="custom_extension_package",
->     entry_points={
->         "spacy_factories": ["your_component = component:ComponentFactory"],
->         "spacy_languages": ["xyz = language:XYZLanguage"],
->     }
-> )
-> ```
-
-Using entry points, model packages and extension packages can now define their
-own `"spacy_factories"` and `"spacy_languages"`, which will be added to the
-built-in factories and languages. If a package in the same environment exposes
-spaCy entry points, all of this happens automatically and no further user action
-is required.
-
-<Infobox>
-
-**Usage:** [Using entry points](/usage/saving-loading#entry-points)
-
-</Infobox>
-
 ### Retokenizer for merging and splitting

 > #### Example

@@ -169,6 +142,33 @@ deprecated.

 </Infobox>

+### Components and languages via entry points
+
+> #### Example
+>
+> ```python
+> from setuptools import setup
+> setup(
+>     name="custom_extension_package",
+>     entry_points={
+>         "spacy_factories": ["your_component = component:ComponentFactory"],
+>         "spacy_languages": ["xyz = language:XYZLanguage"],
+>     }
+> )
+> ```
+
+Using entry points, model packages and extension packages can now define their
+own `"spacy_factories"` and `"spacy_languages"`, which will be added to the
+built-in factories and languages. If a package in the same environment exposes
+spaCy entry points, all of this happens automatically and no further user action
+is required.
+
+<Infobox>
+
+**Usage:** [Using entry points](/usage/saving-loading#entry-points)
+
+</Infobox>
+
 ### Improved documentation

 Although it looks pretty much the same, we've rebuilt the entire documentation

@@ -210,6 +210,12 @@ if all of your models are up to date, you can run the

 </Infobox>

+- Due to difficulties linking our new
+  [`blis`](https://github.com/explosion/cython-blis) for faster
+  platform-independent matrix multiplication, this nightly release currently
+  **doesn't work on Python 2.7 on Windows**. We expect this to be corrected in
+  the future.
+
 - While the [`Matcher`](/api/matcher) API is fully backwards compatible, its
   algorithm has changed to fix a number of bugs and performance issues. This
   means that the `Matcher` in v2.1.x may produce different results compared to
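Once a package like the one in the entry-points example above is installed, its registered factory becomes available by name like any built-in. A hypothetical sketch of the consuming side (the component name matches the example entry point, not a real package):

```python
import spacy

nlp = spacy.blank("en")
# "your_component" resolves via the "spacy_factories" entry point
# registered by the installed package; no explicit import is needed.
component = nlp.create_pipe("your_component")
nlp.add_pipe(component)
```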