mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge branch 'develop' into spacy.io
This commit is contained in:
commit
f34d6281d6
|
@ -1,51 +1,21 @@
|
|||
environment:
|
||||
|
||||
matrix:
|
||||
|
||||
# For Python versions available on Appveyor, see
|
||||
# http://www.appveyor.com/docs/installed-software#python
|
||||
|
||||
#- PYTHON: "C:\\Python27-x64"
|
||||
#- PYTHON: "C:\\Python34"
|
||||
#- PYTHON: "C:\\Python35"
|
||||
#- DISTUTILS_USE_SDK: "1"
|
||||
#- PYTHON: "C:\\Python34-x64"
|
||||
#- DISTUTILS_USE_SDK: "1"
|
||||
- PYTHON: "C:\\Python35-x64"
|
||||
- PYTHON: "C:\\Python36-x64"
|
||||
- PYTHON: "C:\\Python37-x64"
|
||||
|
||||
install:
|
||||
# We need wheel installed to build wheels
|
||||
- "%PYTHON%\\python.exe -m pip install wheel"
|
||||
- "%PYTHON%\\python.exe -m pip install cython"
|
||||
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
|
||||
- "%PYTHON%\\python.exe -m pip install -e ."
|
||||
|
||||
build: off
|
||||
|
||||
test_script:
|
||||
# Put your test command here.
|
||||
# If you don't need to build C extensions on 64-bit Python 3.4,
|
||||
# you can remove "build.cmd" from the front of the command, as it's
|
||||
# only needed to support those cases.
|
||||
# Note that you must use the environment variable %PYTHON% to refer to
|
||||
# the interpreter you're using - Appveyor does not do anything special
|
||||
# to put the Python version you want to use on PATH.
|
||||
- "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
|
||||
|
||||
after_test:
|
||||
# This step builds your wheels.
|
||||
# Again, you only need build.cmd if you're building C extensions for
|
||||
# 64-bit Python 3.4. And you need to use %PYTHON% to get the correct
|
||||
# interpreter
|
||||
- "%PYTHON%\\python.exe setup.py bdist_wheel"
|
||||
|
||||
artifacts:
|
||||
# bdist_wheel puts your built wheel in the dist directory
|
||||
- path: dist\*
|
||||
|
||||
#on_success:
|
||||
# You can use this step to upload your artifacts to a public website.
|
||||
# See Appveyor's documentation for more details. Or you can simply
|
||||
# access your wheels from the Appveyor "artifacts" tab for your build.
|
||||
branches:
|
||||
except:
|
||||
- spacy.io
|
||||
|
|
14
.travis.yml
14
.travis.yml
|
@ -1,26 +1,20 @@
|
|||
language: python
|
||||
|
||||
sudo: false
|
||||
cache: pip
|
||||
dist: trusty
|
||||
group: edge
|
||||
|
||||
python:
|
||||
- "2.7"
|
||||
- "3.5"
|
||||
- "3.6"
|
||||
|
||||
os:
|
||||
- linux
|
||||
|
||||
env:
|
||||
- VIA=compile
|
||||
- VIA=flake8
|
||||
#- VIA=pypi_nightly
|
||||
|
||||
install:
|
||||
- "./travis.sh"
|
||||
- pip install flake8
|
||||
|
||||
script:
|
||||
- "cat /proc/cpuinfo | grep flags | head -n 1"
|
||||
- "pip install pytest pytest-timeout"
|
||||
|
@ -28,10 +22,10 @@ script:
|
|||
- if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
|
||||
- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
|
||||
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
|
||||
|
||||
branches:
|
||||
except:
|
||||
- spacy.io
|
||||
notifications:
|
||||
slack:
|
||||
secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
|
||||
email: false
|
||||
|
||||
cache: pip
|
||||
|
|
|
@ -41,7 +41,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
|
|||
# add the text classifier to the pipeline if it doesn't exist
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
if "textcat" not in nlp.pipe_names:
|
||||
textcat = nlp.create_pipe("textcat")
|
||||
textcat = nlp.create_pipe("textcat", config={
|
||||
"architecture": "simple_cnn",
|
||||
"exclusive_classes": True})
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
# otherwise, get it, so we can add labels to it
|
||||
else:
|
||||
|
@ -70,7 +72,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
|
|||
for i in range(n_iter):
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(train_data, size=compounding(4.0, 16.0, 1.001))
|
||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
|
||||
|
@ -138,7 +140,10 @@ def evaluate(tokenizer, textcat, texts, cats):
|
|||
fn += 1
|
||||
precision = tp / (tp + fp)
|
||||
recall = tp / (tp + fn)
|
||||
f_score = 2 * (precision * recall) / (precision + recall)
|
||||
if (precision+recall) == 0:
|
||||
f_score = 0.0
|
||||
else:
|
||||
f_score = 2 * (precision * recall) / (precision + recall)
|
||||
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=2.0.1,<2.1.0
|
||||
thinc>=7.0.1,<7.1.0
|
||||
thinc>=7.0.2,<7.1.0
|
||||
blis>=0.2.2,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.0.12,<1.1.0
|
||||
|
|
2
setup.py
2
setup.py
|
@ -227,7 +227,7 @@ def setup_package():
|
|||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=2.0.1,<2.1.0",
|
||||
"thinc>=7.0.1,<7.1.0",
|
||||
"thinc>=7.0.2,<7.1.0",
|
||||
"blis>=0.2.2,<0.3.0",
|
||||
"plac<1.0.0,>=0.9.6",
|
||||
"requests>=2.13.0,<3.0.0",
|
||||
|
|
22
spacy/_ml.py
22
spacy/_ml.py
|
@ -72,10 +72,10 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.0):
|
|||
|
||||
|
||||
def _zero_init(model):
|
||||
def _zero_init_impl(self, X, y):
|
||||
def _zero_init_impl(self, *args, **kwargs):
|
||||
self.W.fill(0)
|
||||
|
||||
model.on_data_hooks.append(_zero_init_impl)
|
||||
model.on_init_hooks.append(_zero_init_impl)
|
||||
if model.W is not None:
|
||||
model.W.fill(0.0)
|
||||
return model
|
||||
|
@ -564,18 +564,26 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
)
|
||||
|
||||
linear_model = _preprocess_doc >> LinearModel(nr_class)
|
||||
if cfg.get('exclusive_classes'):
|
||||
output_layer = Softmax(nr_class, nr_class * 2)
|
||||
else:
|
||||
output_layer = (
|
||||
zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
|
||||
>> logistic
|
||||
)
|
||||
|
||||
|
||||
model = (
|
||||
(linear_model | cnn_model)
|
||||
>> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
|
||||
>> logistic
|
||||
>> output_layer
|
||||
)
|
||||
model.tok2vec = tok2vec
|
||||
model.tok2vec = chain(tok2vec, flatten)
|
||||
model.nO = nr_class
|
||||
model.lsuv = False
|
||||
return model
|
||||
|
||||
|
||||
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=True, **cfg):
|
||||
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
|
||||
"""
|
||||
Build a simple CNN text classifier, given a token-to-vector model as inputs.
|
||||
If exclusive_classes=True, a softmax non-linearity is applied, so that the
|
||||
|
@ -586,7 +594,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=True,
|
|||
if exclusive_classes:
|
||||
output_layer = Softmax(nr_class, tok2vec.nO)
|
||||
else:
|
||||
output_layer = zero_init(Affine(nr_class, tok2vec.nO)) >> logistic
|
||||
output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
|
||||
model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
|
||||
model.tok2vec = chain(tok2vec, flatten)
|
||||
model.nO = nr_class
|
||||
|
|
|
@ -4,13 +4,13 @@
|
|||
# fmt: off
|
||||
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "2.1.0a8"
|
||||
__version__ = "2.1.0a9.dev1"
|
||||
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
|
||||
__uri__ = "https://spacy.io"
|
||||
__author__ = "Explosion AI"
|
||||
__email__ = "contact@explosion.ai"
|
||||
__license__ = "MIT"
|
||||
__release__ = True
|
||||
__release__ = False
|
||||
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -253,10 +253,10 @@ class EntityRenderer(object):
|
|||
label = span["label"]
|
||||
start = span["start"]
|
||||
end = span["end"]
|
||||
entity = text[start:end]
|
||||
entity = escape_html(text[start:end])
|
||||
fragments = text[offset:start].split("\n")
|
||||
for i, fragment in enumerate(fragments):
|
||||
markup += fragment
|
||||
markup += escape_html(fragment)
|
||||
if len(fragments) > 1 and i != len(fragments) - 1:
|
||||
markup += "</br>"
|
||||
if self.ents is None or label.upper() in self.ents:
|
||||
|
@ -265,7 +265,7 @@ class EntityRenderer(object):
|
|||
else:
|
||||
markup += entity
|
||||
offset = end
|
||||
markup += text[offset:]
|
||||
markup += escape_html(text[offset:])
|
||||
markup = TPL_ENTS.format(content=markup, colors=self.colors)
|
||||
if title:
|
||||
markup = TPL_TITLE.format(title=title) + markup
|
||||
|
|
|
@ -24,7 +24,8 @@ from ..vocab cimport Vocab
|
|||
from ..syntax import nonproj
|
||||
from ..attrs import POS, ID
|
||||
from ..parts_of_speech import X
|
||||
from .._ml import Tok2Vec, build_tagger_model, build_simple_cnn_text_classifier
|
||||
from .._ml import Tok2Vec, build_tagger_model
|
||||
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
|
||||
from .._ml import link_vectors_to_models, zero_init, flatten
|
||||
from .._ml import masked_language_model, create_default_optimizer
|
||||
from ..errors import Errors, TempErrors
|
||||
|
@ -862,8 +863,11 @@ class TextCategorizer(Pipe):
|
|||
token_vector_width = cfg["token_vector_width"]
|
||||
else:
|
||||
token_vector_width = util.env_opt("token_vector_width", 96)
|
||||
tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
|
||||
return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
|
||||
if cfg.get('architecture') == 'simple_cnn':
|
||||
tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
|
||||
return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
|
||||
else:
|
||||
return build_text_classifier(nr_class, **cfg)
|
||||
|
||||
@property
|
||||
def tok2vec(self):
|
||||
|
@ -942,7 +946,7 @@ class TextCategorizer(Pipe):
|
|||
not_missing = self.model.ops.asarray(not_missing)
|
||||
d_scores = (scores-truths) / scores.shape[0]
|
||||
d_scores *= not_missing
|
||||
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
|
||||
mean_square_error = (d_scores**2).sum(axis=1).mean()
|
||||
return float(mean_square_error), d_scores
|
||||
|
||||
def add_label(self, label):
|
||||
|
@ -964,11 +968,6 @@ class TextCategorizer(Pipe):
|
|||
|
||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
|
||||
**kwargs):
|
||||
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
|
||||
token_vector_width = pipeline[0].model.nO
|
||||
else:
|
||||
token_vector_width = 64
|
||||
|
||||
if self.model is True:
|
||||
self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
|
||||
self.model = self.Model(len(self.labels), **self.cfg)
|
||||
|
|
|
@ -204,7 +204,9 @@ class ParserModel(Model):
|
|||
if new_output == self.upper.nO:
|
||||
return
|
||||
smaller = self.upper
|
||||
larger = Affine(new_output, smaller.nI)
|
||||
|
||||
with Model.use_device('cpu'):
|
||||
larger = Affine(new_output, smaller.nI)
|
||||
# Set nan as value for unseen classes, to prevent prediction.
|
||||
larger.W.fill(self.ops.xp.nan)
|
||||
larger.b.fill(self.ops.xp.nan)
|
||||
|
|
16
spacy/tests/regression/test_issue2728.py
Normal file
16
spacy/tests/regression/test_issue2728.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy import displacy
|
||||
from spacy.tokens import Doc, Span
|
||||
|
||||
|
||||
def test_issue2728(en_vocab):
|
||||
"""Test that displaCy ENT visualizer escapes HTML correctly."""
|
||||
doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
|
||||
doc.ents = [Span(doc, 0, 1, label="TEST")]
|
||||
html = displacy.render(doc, style="ent")
|
||||
assert "<RELEASE>" in html
|
||||
doc.ents = [Span(doc, 1, 2, label="TEST")]
|
||||
html = displacy.render(doc, style="ent")
|
||||
assert "<RELEASE>" in html
|
|
@ -107,8 +107,8 @@ details and examples.
|
|||
>
|
||||
> ```python
|
||||
> from spacy.attrs import ORTH, LEMMA
|
||||
> case = [{"don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]}]
|
||||
> tokenizer.add_special_case(case)
|
||||
> case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
|
||||
> tokenizer.add_special_case("don't", case)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
|
|
|
@ -8,7 +8,7 @@ menu:
|
|||
- ['Changelog', 'changelog']
|
||||
---
|
||||
|
||||
spaCy is compatible with **64-bit CPython 2.6+/3.3+** and runs on
|
||||
spaCy is compatible with **64-bit CPython 2.7+/3.4+** and runs on
|
||||
**Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are
|
||||
available over [pip](https://pypi.python.org/pypi/spacy) and
|
||||
[conda](https://anaconda.org/conda-forge/spacy).
|
||||
|
|
|
@ -10,11 +10,11 @@ menu:
|
|||
|
||||
spaCy v2.1 has focussed primarily on stability and performance, solidifying the
|
||||
design changes introduced in [v2.0](/usage/v2). As well as smaller models,
|
||||
faster runtime, and many bug-fixes, v2.1 also introduces experimental support
|
||||
faster runtime, and many bug fixes, v2.1 also introduces experimental support
|
||||
for some exciting new NLP innovations. For the full changelog, see the
|
||||
[release notes on GitHub](https://github.com/explosion/spaCy/releases/tag/v2.1.0).
|
||||
|
||||
### BERT/ULMFit/Elmo-style pre-training
|
||||
### BERT/ULMFit/Elmo-style pre-training {tag="experimental"}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -115,33 +115,6 @@ or `POS` for finding sequences of the same part-of-speech tags.
|
|||
|
||||
</Infobox>
|
||||
|
||||
### Components and languages via entry points
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from setuptools import setup
|
||||
> setup(
|
||||
> name="custom_extension_package",
|
||||
> entry_points={
|
||||
> "spacy_factories": ["your_component = component:ComponentFactory"]
|
||||
> "spacy_languages": ["xyz = language:XYZLanguage"]
|
||||
> }
|
||||
> )
|
||||
> ```
|
||||
|
||||
Using entry points, model packages and extension packages can now define their
|
||||
own `"spacy_factories"` and `"spacy_languages"`, which will be added to the
|
||||
built-in factories and languages. If a package in the same environment exposes
|
||||
spaCy entry points, all of this happens automatically and no further user action
|
||||
is required.
|
||||
|
||||
<Infobox>
|
||||
|
||||
**Usage:** [Using entry points](/usage/saving-loading#entry-points)
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Retokenizer for merging and splitting
|
||||
|
||||
> #### Example
|
||||
|
@ -169,6 +142,33 @@ deprecated.
|
|||
|
||||
</Infobox>
|
||||
|
||||
### Components and languages via entry points
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from setuptools import setup
|
||||
> setup(
|
||||
> name="custom_extension_package",
|
||||
> entry_points={
|
||||
> "spacy_factories": ["your_component = component:ComponentFactory"]
|
||||
> "spacy_languages": ["xyz = language:XYZLanguage"]
|
||||
> }
|
||||
> )
|
||||
> ```
|
||||
|
||||
Using entry points, model packages and extension packages can now define their
|
||||
own `"spacy_factories"` and `"spacy_languages"`, which will be added to the
|
||||
built-in factories and languages. If a package in the same environment exposes
|
||||
spaCy entry points, all of this happens automatically and no further user action
|
||||
is required.
|
||||
|
||||
<Infobox>
|
||||
|
||||
**Usage:** [Using entry points](/usage/saving-loading#entry-points)
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Improved documentation
|
||||
|
||||
Although it looks pretty much the same, we've rebuilt the entire documentation
|
||||
|
@ -210,6 +210,12 @@ if all of your models are up to date, you can run the
|
|||
|
||||
</Infobox>
|
||||
|
||||
- Due to difficulties linking our new
|
||||
[`blis`](https://github.com/explosion/cython-blis) for faster
|
||||
platform-independent matrix multiplication, this nightly release currently
|
||||
**doesn't work on Python 2.7 on Windows**. We expect this to be corrected in
|
||||
the future.
|
||||
|
||||
- While the [`Matcher`](/api/matcher) API is fully backwards compatible, its
|
||||
algorithm has changed to fix a number of bugs and performance issues. This
|
||||
means that the `Matcher` in v2.1.x may produce different results compared to
|
||||
|
|
Loading…
Reference in New Issue
Block a user