Merge branch 'develop' into spacy.io

Ines Montani 2019-02-24 12:08:15 +01:00
commit f34d6281d6
14 changed files with 102 additions and 102 deletions

View File

@@ -1,51 +1,21 @@
environment:
matrix:
# For Python versions available on Appveyor, see
# http://www.appveyor.com/docs/installed-software#python
#- PYTHON: "C:\\Python27-x64"
#- PYTHON: "C:\\Python34"
#- PYTHON: "C:\\Python35"
#- DISTUTILS_USE_SDK: "1"
#- PYTHON: "C:\\Python34-x64"
#- DISTUTILS_USE_SDK: "1"
- PYTHON: "C:\\Python35-x64"
- PYTHON: "C:\\Python36-x64"
- PYTHON: "C:\\Python37-x64"
install:
# We need wheel installed to build wheels
- "%PYTHON%\\python.exe -m pip install wheel"
- "%PYTHON%\\python.exe -m pip install cython"
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
- "%PYTHON%\\python.exe -m pip install -e ."
build: off
test_script:
# Put your test command here.
# If you don't need to build C extensions on 64-bit Python 3.4,
# you can remove "build.cmd" from the front of the command, as it's
# only needed to support those cases.
# Note that you must use the environment variable %PYTHON% to refer to
# the interpreter you're using - Appveyor does not do anything special
# to put the Python version you want to use on PATH.
- "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
after_test:
# This step builds your wheels.
# Again, you only need build.cmd if you're building C extensions for
# 64-bit Python 3.4. And you need to use %PYTHON% to get the correct
# interpreter
- "%PYTHON%\\python.exe setup.py bdist_wheel"
artifacts:
# bdist_wheel puts your built wheel in the dist directory
- path: dist\*
#on_success:
# You can use this step to upload your artifacts to a public website.
# See Appveyor's documentation for more details. Or you can simply
# access your wheels from the Appveyor "artifacts" tab for your build.
branches:
except:
- spacy.io

View File

@@ -1,26 +1,20 @@
language: python
sudo: false
cache: pip
dist: trusty
group: edge
python:
- "2.7"
- "3.5"
- "3.6"
os:
- linux
env:
- VIA=compile
- VIA=flake8
#- VIA=pypi_nightly
install:
- "./travis.sh"
- pip install flake8
script:
- "cat /proc/cpuinfo | grep flags | head -n 1"
- "pip install pytest pytest-timeout"
@@ -28,10 +22,10 @@ script:
- if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
branches:
except:
- spacy.io
notifications:
slack:
secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
email: false
cache: pip

View File

@@ -41,7 +41,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if "textcat" not in nlp.pipe_names:
textcat = nlp.create_pipe("textcat")
textcat = nlp.create_pipe("textcat", config={
"architecture": "simple_cnn",
"exclusive_classes": True})
nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
@@ -70,7 +72,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
for i in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(train_data, size=compounding(4.0, 16.0, 1.001))
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
@@ -138,6 +140,9 @@ def evaluate(tokenizer, textcat, texts, cats):
fn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
if (precision+recall) == 0:
f_score = 0.0
else:
f_score = 2 * (precision * recall) / (precision + recall)
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

View File

@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=2.0.1,<2.1.0
thinc>=7.0.1,<7.1.0
thinc>=7.0.2,<7.1.0
blis>=0.2.2,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.0.12,<1.1.0

View File

@@ -227,7 +227,7 @@ def setup_package():
"murmurhash>=0.28.0,<1.1.0",
"cymem>=2.0.2,<2.1.0",
"preshed>=2.0.1,<2.1.0",
"thinc>=7.0.1,<7.1.0",
"thinc>=7.0.2,<7.1.0",
"blis>=0.2.2,<0.3.0",
"plac<1.0.0,>=0.9.6",
"requests>=2.13.0,<3.0.0",

View File

@@ -72,10 +72,10 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.0):
def _zero_init(model):
def _zero_init_impl(self, X, y):
def _zero_init_impl(self, *args, **kwargs):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
model.on_init_hooks.append(_zero_init_impl)
if model.W is not None:
model.W.fill(0.0)
return model
@@ -564,18 +564,26 @@ def build_text_classifier(nr_class, width=64, **cfg):
)
linear_model = _preprocess_doc >> LinearModel(nr_class)
model = (
(linear_model | cnn_model)
>> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
if cfg.get('exclusive_classes'):
output_layer = Softmax(nr_class, nr_class * 2)
else:
output_layer = (
zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
>> logistic
)
model.tok2vec = tok2vec
model = (
(linear_model | cnn_model)
>> output_layer
)
model.tok2vec = chain(tok2vec, flatten)
model.nO = nr_class
model.lsuv = False
return model
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=True, **cfg):
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
"""
Build a simple CNN text classifier, given a token-to-vector model as inputs.
If exclusive_classes=True, a softmax non-linearity is applied, so that the
@@ -586,7 +594,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=True,
if exclusive_classes:
output_layer = Softmax(nr_class, tok2vec.nO)
else:
output_layer = zero_init(Affine(nr_class, tok2vec.nO)) >> logistic
output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
model.tok2vec = chain(tok2vec, flatten)
model.nO = nr_class
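To illustrate what the `exclusive_classes` flag changes in the classifiers above, here's a rough numpy sketch (not the Thinc layers themselves): a `Softmax` output produces one probability distribution over the classes, while the zero-initialised `Affine >> logistic` output scores every class independently.

```python
import numpy as np

scores = np.array([2.0, -1.0, 0.5])     # raw class scores for one document

# exclusive_classes=True: softmax output, probabilities sum to 1
softmax = np.exp(scores) / np.exp(scores).sum()

# exclusive_classes=False: element-wise logistic, classes scored independently
logistic = 1.0 / (1.0 + np.exp(-scores))

print(softmax.round(3))   # [0.786 0.039 0.175], sums to 1
print(logistic.round(3))  # [0.881 0.269 0.622], each in (0, 1)
```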

View File

@@ -4,13 +4,13 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "2.1.0a8"
__version__ = "2.1.0a9.dev1"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
__release__ = True
__release__ = False
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -253,10 +253,10 @@ class EntityRenderer(object):
label = span["label"]
start = span["start"]
end = span["end"]
entity = text[start:end]
entity = escape_html(text[start:end])
fragments = text[offset:start].split("\n")
for i, fragment in enumerate(fragments):
markup += fragment
markup += escape_html(fragment)
if len(fragments) > 1 and i != len(fragments) - 1:
markup += "</br>"
if self.ents is None or label.upper() in self.ents:
@@ -265,7 +265,7 @@ class EntityRenderer(object):
else:
markup += entity
offset = end
markup += text[offset:]
markup += escape_html(text[offset:])
markup = TPL_ENTS.format(content=markup, colors=self.colors)
if title:
markup = TPL_TITLE.format(title=title) + markup
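The renderer change above escapes the entity text and the surrounding fragments before they're added to the markup. As a rough, illustrative sketch (a hypothetical stand-in for the module's `escape_html` helper, not a verbatim copy), the escaping amounts to:

```python
def escape_html(text):
    # Replace characters that would otherwise be parsed as markup.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace('"', "&quot;")
    return text

print(escape_html("<RELEASE>"))  # &lt;RELEASE&gt;, as asserted in the regression test below
```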

View File

@@ -24,7 +24,8 @@ from ..vocab cimport Vocab
from ..syntax import nonproj
from ..attrs import POS, ID
from ..parts_of_speech import X
from .._ml import Tok2Vec, build_tagger_model, build_simple_cnn_text_classifier
from .._ml import Tok2Vec, build_tagger_model
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import masked_language_model, create_default_optimizer
from ..errors import Errors, TempErrors
@@ -862,8 +863,11 @@ class TextCategorizer(Pipe):
token_vector_width = cfg["token_vector_width"]
else:
token_vector_width = util.env_opt("token_vector_width", 96)
if cfg.get('architecture') == 'simple_cnn':
tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
else:
return build_text_classifier(nr_class, **cfg)
@property
def tok2vec(self):
@@ -942,7 +946,7 @@ class TextCategorizer(Pipe):
not_missing = self.model.ops.asarray(not_missing)
d_scores = (scores-truths) / scores.shape[0]
d_scores *= not_missing
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
mean_square_error = (d_scores**2).sum(axis=1).mean()
return float(mean_square_error), d_scores
def add_label(self, label):
@@ -964,11 +968,6 @@ class TextCategorizer(Pipe):
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
**kwargs):
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
token_vector_width = pipeline[0].model.nO
else:
token_vector_width = 64
if self.model is True:
self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
self.model = self.Model(len(self.labels), **self.cfg)
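For reference, here's a small numpy sketch (not spaCy code) of the `get_loss` computation as changed above: the reported mean squared error is now taken from the masked gradient rather than the raw score difference.

```python
import numpy as np

scores = np.asarray([[0.8, 0.1], [0.3, 0.9]])    # model outputs for two docs
truths = np.asarray([[1.0, 0.0], [0.0, 1.0]])    # gold labels
not_missing = np.ones_like(truths)               # 0.0 where a label is missing

d_scores = (scores - truths) / scores.shape[0]   # gradient of the squared error
d_scores *= not_missing                          # don't propagate missing labels
mean_square_error = (d_scores ** 2).sum(axis=1).mean()
print(float(mean_square_error))
```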

View File

@@ -204,6 +204,8 @@ class ParserModel(Model):
if new_output == self.upper.nO:
return
smaller = self.upper
with Model.use_device('cpu'):
larger = Affine(new_output, smaller.nI)
# Set nan as value for unseen classes, to prevent prediction.
larger.W.fill(self.ops.xp.nan)

View File

@@ -0,0 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
from spacy import displacy
from spacy.tokens import Doc, Span
def test_issue2728(en_vocab):
"""Test that displaCy ENT visualizer escapes HTML correctly."""
doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
doc.ents = [Span(doc, 0, 1, label="TEST")]
html = displacy.render(doc, style="ent")
assert "&lt;RELEASE&gt;" in html
doc.ents = [Span(doc, 1, 2, label="TEST")]
html = displacy.render(doc, style="ent")
assert "&lt;RELEASE&gt;" in html

View File

@@ -107,8 +107,8 @@ details and examples.
>
> ```python
> from spacy.attrs import ORTH, LEMMA
> case = [{"don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]}]
> tokenizer.add_special_case(case)
> case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
> tokenizer.add_special_case("don't", case)
> ```
| Name | Type | Description |

View File

@@ -8,7 +8,7 @@ menu:
- ['Changelog', 'changelog']
---
spaCy is compatible with **64-bit CPython 2.6+/3.3+** and runs on
spaCy is compatible with **64-bit CPython 2.7+/3.4+** and runs on
**Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are
available over [pip](https://pypi.python.org/pypi/spacy) and
[conda](https://anaconda.org/conda-forge/spacy).

View File

@@ -10,11 +10,11 @@ menu:
spaCy v2.1 has focussed primarily on stability and performance, solidifying the
design changes introduced in [v2.0](/usage/v2). As well as smaller models,
faster runtime, and many bug-fixes, v2.1 also introduces experimental support
faster runtime, and many bug fixes, v2.1 also introduces experimental support
for some exciting new NLP innovations. For the full changelog, see the
[release notes on GitHub](https://github.com/explosion/spaCy/releases/tag/v2.1.0).
### BERT/ULMFit/Elmo-style pre-training
### BERT/ULMFit/Elmo-style pre-training {tag="experimental"}
> #### Example
>
@@ -115,33 +115,6 @@ or `POS` for finding sequences of the same part-of-speech tags.
</Infobox>
### Components and languages via entry points
> #### Example
>
> ```python
> from setuptools import setup
> setup(
> name="custom_extension_package",
> entry_points={
> "spacy_factories": ["your_component = component:ComponentFactory"]
> "spacy_languages": ["xyz = language:XYZLanguage"]
> }
> )
> ```
Using entry points, model packages and extension packages can now define their
own `"spacy_factories"` and `"spacy_languages"`, which will be added to the
built-in factories and languages. If a package in the same environment exposes
spaCy entry points, all of this happens automatically and no further user action
is required.
<Infobox>
**Usage:** [Using entry points](/usage/saving-loading#entry-points)
</Infobox>
### Retokenizer for merging and splitting
> #### Example
@@ -169,6 +142,33 @@ deprecated.
</Infobox>
### Components and languages via entry points
> #### Example
>
> ```python
> from setuptools import setup
> setup(
> name="custom_extension_package",
> entry_points={
> "spacy_factories": ["your_component = component:ComponentFactory"]
> "spacy_languages": ["xyz = language:XYZLanguage"]
> }
> )
> ```
Using entry points, model packages and extension packages can now define their
own `"spacy_factories"` and `"spacy_languages"`, which will be added to the
built-in factories and languages. If a package in the same environment exposes
spaCy entry points, all of this happens automatically and no further user action
is required.
<Infobox>
**Usage:** [Using entry points](/usage/saving-loading#entry-points)
</Infobox>
### Improved documentation
Although it looks pretty much the same, we've rebuilt the entire documentation
@@ -210,6 +210,12 @@ if all of your models are up to date, you can run the
</Infobox>
- Due to difficulties linking our new
[`blis`](https://github.com/explosion/cython-blis) for faster
platform-independent matrix multiplication, this nightly release currently
**doesn't work on Python 2.7 on Windows**. We expect this to be corrected in
the future.
- While the [`Matcher`](/api/matcher) API is fully backwards compatible, its
algorithm has changed to fix a number of bugs and performance issues. This
means that the `Matcher` in v2.1.x may produce different results compared to