2017-10-27 15:39:30 +03:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import numpy
|
|
|
|
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
|
2017-10-03 21:07:17 +03:00
|
|
|
from thinc.t2t import ExtractWindow, ParametricAttention
|
2018-12-10 16:37:39 +03:00
|
|
|
from thinc.t2v import Pooling, sum_pool, mean_pool
|
2019-10-25 23:28:20 +03:00
|
|
|
from thinc.i2v import HashEmbed
|
|
|
|
from thinc.misc import Residual, FeatureExtracter
|
2017-10-03 21:07:17 +03:00
|
|
|
from thinc.misc import LayerNorm as LN
|
2017-05-06 21:38:12 +03:00
|
|
|
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
2018-12-06 17:12:39 +03:00
|
|
|
from thinc.api import with_getitem, flatten_add_lengths
|
2017-10-27 15:39:30 +03:00
|
|
|
from thinc.api import uniqued, wrap, noop
|
2017-10-03 21:07:17 +03:00
|
|
|
from thinc.linear.linear import LinearModel
|
Update draft of parser neural network model
Model is good, but code is messy. Currently requires Chainer, which may cause the build to fail on machines without a GPU.
Outline of the model:
We first predict context-sensitive vectors for each word in the input:
(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(token_width)
>> convolution ** 4
This convolutional layer is shared between the tagger and the parser. This prevents the parser from needing tag features.
To boost the representation, we make a "super tag" with POS, morphology and dependency label. The tagger predicts this
by adding a softmax layer onto the convolutional layer --- so, we're teaching the convolutional layer to give us a
representation that's one affine transform from this informative lexical information. This is obviously good for the
parser (which backprops to the convolutions too).
The parser model makes a state vector by concatenating the vector representations for its context tokens. Current
results suggest few context tokens works well. Maybe this is a bug.
The current context tokens:
* S0, S1, S2: Top three words on the stack
* B0, B1: First two words of the buffer
* S0L1, S0L2: Leftmost and second leftmost children of S0
* S0R1, S0R2: Rightmost and second rightmost children of S0
* S1L1, S1L2, S1R2, S1R, B0L1, B0L2: Likewise for S1 and B0
This makes the state vector quite long: 13*T, where T is the token vector width (128 is working well). Fortunately,
there's a way to structure the computation to save some expense (and make it more GPU friendly).
The parser typically visits 2*N states for a sentence of length N (although it may visit more, if it back-tracks
with a non-monotonic transition). A naive implementation would require 2*N (B, 13*T) @ (13*T, H) matrix multiplications
for a batch of size B. We can instead perform one (B*N, T) @ (T, 13*H) multiplication, to pre-compute the hidden
weights for each positional feature wrt the words in the batch. (Note that our token vectors come from the CNN
-- so we can't play this trick over the vocabulary. That's how Stanford's NN parser works --- and why its model
is so big.)
This pre-computation strategy allows a nice compromise between GPU-friendliness and implementation simplicity.
The CNN and the wide lower layer are computed on the GPU, and then the precomputed hidden weights are moved
to the CPU, before we start the transition-based parsing process. This makes a lot of things much easier.
We don't have to worry about variable-length batch sizes, and we don't have to implement the dynamic oracle
in CUDA to train.
Currently the parser's loss function is multilabel log loss, as the dynamic oracle allows multiple states to
be 0 cost. This is defined as:
(exp(score) / Z) - (exp(score) / gZ)
Where gZ is the sum of the scores assigned to gold classes. I'm very interested in regressing on the cost directly,
but so far this isn't working well.
Machinery is in place for beam-search, which has been working well for the linear model. Beam search should benefit
greatly from the pre-computation trick.
2017-05-13 00:09:15 +03:00
|
|
|
from thinc.neural.ops import NumpyOps, CupyOps
|
2019-03-09 14:50:08 +03:00
|
|
|
from thinc.neural.util import get_array_module, copy_array
|
2017-11-06 16:25:37 +03:00
|
|
|
from thinc.neural.optimizers import Adam
|
2017-10-03 21:07:17 +03:00
|
|
|
|
2017-05-08 12:36:37 +03:00
|
|
|
from thinc import describe
|
|
|
|
from thinc.describe import Dimension, Synapses, Biases, Gradient
|
|
|
|
from thinc.neural._classes.affine import _set_dimensions_if_needed
|
2017-10-03 19:39:57 +03:00
|
|
|
import thinc.extra.load_nlp
|
2017-08-12 13:45:20 +03:00
|
|
|
|
2017-10-27 15:39:30 +03:00
|
|
|
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
2019-07-11 15:46:29 +03:00
|
|
|
from .errors import Errors, user_warning, Warnings
|
2017-08-18 22:55:23 +03:00
|
|
|
from . import util
|
2019-10-25 23:28:20 +03:00
|
|
|
from . import ml as new_ml
|
2019-10-31 17:01:15 +03:00
|
|
|
from .ml import _legacy_tok2vec
|
2017-05-04 14:31:40 +03:00
|
|
|
|
2017-10-03 21:29:58 +03:00
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
# Key string used to refer to pretrained vectors.
VECTORS_KEY = "spacy_pretrained_vectors"
# Backwards compatibility with <2.2.2
USE_MODEL_REGISTRY_TOK2VEC = False
|
2017-05-08 12:36:37 +03:00
|
|
|
|
2017-10-27 15:39:30 +03:00
|
|
|
|
2017-10-31 04:00:26 +03:00
|
|
|
def cosine(vec1, vec2):
    """Return the cosine similarity between two vectors.

    The array module used for the norm computation is looked up from
    `vec1` via `get_array_module`.

    vec1: First vector; must support `.dot(vec2)`.
    vec2: Second vector.
    RETURNS: `vec1.dot(vec2) / (norm1 * norm2)`, or 0 if either vector has a
        zero norm.
    """
    xp = get_array_module(vec1)
    norm1 = xp.linalg.norm(vec1)
    norm2 = xp.linalg.norm(vec2)
    # Guard against division by zero for all-zero vectors.
    if norm1 == 0.0 or norm2 == 0.0:
        return 0
    else:
        return vec1.dot(vec2) / (norm1 * norm2)
|
2017-10-31 04:00:26 +03:00
|
|
|
|
|
|
|
|
2017-11-06 16:11:59 +03:00
|
|
|
def create_default_optimizer(ops, **cfg):
    """Create an Adam optimizer with the default hyper-parameters.

    Each hyper-parameter is obtained through `util.env_opt(name, default)`
    (presumably allowing an override from the environment; confirm against
    `util.env_opt`).

    ops: Backend ops object; passed to `Adam`, and its `device` attribute is
        copied onto the optimizer.
    **cfg: Accepted for call compatibility; not read by this function.
    RETURNS: The configured `Adam` instance.
    """
    learn_rate = util.env_opt("learn_rate", 0.001)
    # Standard Adam settings (beta1=0.9, beta2=0.999, eps=1e-8). More
    # aggressive values (beta1=0.8, beta2=0.8, eps=1e-5) gave a small accuracy
    # gain but led the model to converge to extreme scores with high weight
    # variance, which performed badly under beam-search decoding, so they were
    # reverted (#3415).
    beta1 = util.env_opt("optimizer_B1", 0.9)
    beta2 = util.env_opt("optimizer_B2", 0.999)
    eps = util.env_opt("optimizer_eps", 1e-8)
    L2 = util.env_opt("L2_penalty", 1e-6)
    max_grad_norm = util.env_opt("grad_norm_clip", 1.0)
    optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
    # These two are set as attributes after construction rather than passed
    # to the constructor.
    optimizer.max_grad_norm = max_grad_norm
    optimizer.device = ops.device
    return optimizer
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
2017-07-22 21:03:40 +03:00
|
|
|
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
    """Flatten a batch of sequences and return it together with the lengths.

    seqs: Iterable of sequences; `len()` is taken of each one.
    pad: Padding amount forwarded to `ops.flatten` / `ops.unflatten`.
    drop: Accepted for the layer calling convention; not used here.
    RETURNS: `((X, lengths), finish_update)` where `X` is the flattened batch,
        `lengths` is an array (dtype "i") of the per-sequence lengths, and
        `finish_update` unflattens a gradient back into per-sequence pieces.
    """
    ops = Model.ops
    lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")

    def finish_update(d_X, sgd=None):
        # Undo the flattening using the lengths captured in the forward pass.
        return ops.unflatten(d_X, lengths, pad=pad)

    X = ops.flatten(seqs, pad=pad)
    return (X, lengths), finish_update
|
|
|
|
|
|
|
|
|
|
|
|
def _zero_init(model):
    """Make a model's weights `W` start out as all zeros.

    A hook that zero-fills `W` is appended to `model.on_init_hooks`, and if
    `model.W` already exists (is not None) it is zero-filled right away.

    model: Object with an `on_init_hooks` list and a `W` attribute.
    RETURNS: The same `model` object, for chaining.
    """

    def _zero_init_impl(self, *args, **kwargs):
        # Extra hook arguments are accepted and ignored.
        self.W.fill(0)

    model.on_init_hooks.append(_zero_init_impl)
    if model.W is not None:
        model.W.fill(0.0)
    return model
|
|
|
|
|
2017-08-18 22:55:23 +03:00
|
|
|
|
2019-03-09 20:50:08 +03:00
|
|
|
def with_cpu(ops, model):
    """Wrap a model that should run on CPU, transferring inputs and outputs
    as necessary.

    ops: Ops object of the surrounding pipeline; forward outputs are converted
        with it via `_to_device`.
    model: The model to run on CPU. `model.to_cpu()` is called immediately.
    RETURNS: The result of `wrap(with_cpu_forward, model)`.
    """
    model.to_cpu()

    def with_cpu_forward(inputs, drop=0.0):
        # Run the wrapped model on CPU copies of the inputs, then hand the
        # outputs back in the caller's array type.
        cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
        gpu_outputs = _to_device(ops, cpu_outputs)

        def with_cpu_backprop(d_outputs, sgd=None):
            # The incoming gradient is moved to CPU; whatever the wrapped
            # backprop returns is passed through unchanged.
            cpu_d_outputs = _to_cpu(d_outputs)
            return backprop(cpu_d_outputs, sgd=sgd)

        return gpu_outputs, with_cpu_backprop

    return wrap(with_cpu_forward, model)
|
|
|
|
|
|
|
|
|
|
|
|
def _to_cpu(X):
    """Recursively convert `X` to its CPU representation.

    X: A numpy array, a tuple or list (converted element-wise), an object
        exposing a zero-argument `get()` method (presumably a GPU array such
        as a CuPy ndarray; confirm), or anything else.
    RETURNS: numpy arrays and unrecognised objects are returned unchanged;
        tuples and lists are rebuilt with converted elements; for objects
        with `get`, the result of `X.get()`.
    """
    if isinstance(X, numpy.ndarray):
        return X
    elif isinstance(X, tuple):
        return tuple([_to_cpu(x) for x in X])
    elif isinstance(X, list):
        return [_to_cpu(x) for x in X]
    elif hasattr(X, "get"):
        return X.get()
    else:
        return X
|
|
|
|
|
|
|
|
|
|
|
|
def _to_device(ops, X):
    """Recursively convert `X` with `ops.asarray`.

    ops: Object providing `asarray`.
    X: A tuple or list (converted element-wise, container type preserved) or
        any other value, which is passed to `ops.asarray`.
    RETURNS: The converted structure.
    """
    if isinstance(X, tuple):
        return tuple([_to_device(ops, x) for x in X])
    elif isinstance(X, list):
        return [_to_device(ops, x) for x in X]
    else:
        return ops.asarray(X)
|
|
|
|
|
|
|
|
|
2019-03-23 18:44:44 +03:00
|
|
|
class extract_ngrams(Model):
    """Layer that turns docs into counted n-gram keys of a token attribute."""

    def __init__(self, ngram_size, attr=LOWER):
        """Store the layer settings.

        ngram_size: Largest n-gram order to extract; n-grams are produced for
            every n from 2 up to and including this value, in addition to the
            unigrams.
        attr: Token attribute ID passed to `doc.to_array`.
        """
        Model.__init__(self)
        self.ngram_size = ngram_size
        self.attr = attr

    def begin_update(self, docs, drop=0.0):
        """Extract unique n-gram keys and their counts for a batch of docs.

        docs: Iterable of objects providing `to_array([attr])`.
        drop: Accepted for the layer calling convention; not used here.
        RETURNS: `((batch_keys, batch_vals, lengths), None)`: the concatenated
            unique keys of all docs, their counts converted with dtype "f",
            and the number of unique keys per doc. The second element is
            None, i.e. no backprop callback is provided.
        """
        batch_keys = []
        batch_vals = []
        for doc in docs:
            unigrams = doc.to_array([self.attr])
            ngrams = [unigrams]
            for n in range(2, self.ngram_size + 1):
                ngrams.append(self.ops.ngrams(n, unigrams))
            keys = self.ops.xp.concatenate(ngrams)
            # Collapse repeated keys into (key, count) pairs.
            keys, vals = self.ops.xp.unique(keys, return_counts=True)
            batch_keys.append(keys)
            batch_vals.append(vals)
        # The dtype here matches what thinc is expecting -- which differs per
        # platform (by int definition). This should be fixed once the problem
        # is fixed on Thinc's side.
        lengths = self.ops.asarray(
            [arr.shape[0] for arr in batch_keys], dtype=numpy.int_
        )
        batch_keys = self.ops.xp.concatenate(batch_keys)
        batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
        return (batch_keys, batch_vals, lengths), None
|
2017-07-22 21:03:40 +03:00
|
|
|
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
@describe.on_data(
|
|
|
|
_set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
|
|
|
|
)
|
2017-05-08 12:36:37 +03:00
|
|
|
@describe.attributes(
|
|
|
|
nI=Dimension("Input size"),
|
|
|
|
nF=Dimension("Number of features"),
|
|
|
|
nO=Dimension("Output size"),
|
2017-10-20 17:23:31 +03:00
|
|
|
nP=Dimension("Maxout pieces"),
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
|
|
|
b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
|
|
|
|
pad=Synapses(
|
|
|
|
"Pad",
|
2017-10-28 19:45:14 +03:00
|
|
|
lambda obj: (1, obj.nF, obj.nO, obj.nP),
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
lambda M, ops: ops.normal_init(M, 1.0),
|
|
|
|
),
|
2017-05-08 12:36:37 +03:00
|
|
|
d_W=Gradient("W"),
|
2017-10-28 19:45:14 +03:00
|
|
|
d_pad=Gradient("pad"),
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
d_b=Gradient("b"),
|
|
|
|
)
|
2017-05-08 12:36:37 +03:00
|
|
|
class PrecomputableAffine(Model):
|
2017-10-20 17:23:31 +03:00
|
|
|
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
    """Initialise the base model and store the four size settings.

    nO, nI, nF, nP: Values stored on the instance under the same names.
        They are used elsewhere in this class to reshape `W`; presumably
        output width, input width, feature count and maxout pieces
        (confirm against the attribute declarations).
    **kwargs: Passed through unchanged to `Model.__init__`.
    """
    Model.__init__(self, **kwargs)
    # Targets are assigned left to right: nO, nP, nI, nF.
    self.nO, self.nP, self.nI, self.nF = nO, nP, nI, nF
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
def begin_update(self, X, drop=0.0):
    """Compute the hidden activations for each row of `X`, feature by feature.

    X: Input array. It is multiplied against `W` viewed as a
        (nF * nO * nP, nI) matrix, so its rows presumably have width nI.
    drop: Not used by this method.
    RETURNS: A tuple `(Yf, backward)`. `Yf` is the product reshaped to
        (rows, nF, nO, nP) and then passed through `_add_padding`.
        `backward` takes `(dY, ids)` plus an optional `sgd` callback,
        adds to `d_b` and `d_W` (and to `d_pad` via `_backprop_padding`),
        and returns the gradient for the gathered inputs with shape
        (rows of dY, nF, nI).
    """
    flat_W = self.W.reshape((self.nF * self.nO * self.nP, self.nI))
    Yf = self.ops.gemm(X, flat_W, trans2=True)
    Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
    Yf = self._add_padding(Yf)

    def backward(dY_ids, sgd=None):
        dY, ids = dY_ids
        dY, ids = self._backprop_padding(dY, ids)
        n_out = self.nO * self.nP
        n_in = self.nF * self.nI

        Xf = X[ids]
        Xf = Xf.reshape((Xf.shape[0], n_in))

        self.d_b += dY.sum(axis=0)
        # From here on dY is used as a 2-D (rows, nO * nP) matrix.
        dY = dY.reshape((dY.shape[0], n_out))

        # (f, o, p, i) --> (o, p, f, i). Take a real copy: ascontiguousarray
        # hands back memory shared with self.W whenever the transposed view
        # is already contiguous (for instance when nF == 1), and this buffer
        # is zeroed and overwritten below.
        Wopfi = self.W.transpose((1, 2, 0, 3)).copy()
        Wopfi = Wopfi.reshape((n_out, n_in))
        dXf = self.ops.gemm(dY, Wopfi)

        # Reuse the buffer: Wopfi is not needed any more, so it receives
        # the weight gradient.
        dWopfi = Wopfi
        dWopfi.fill(0.0)
        self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
        dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
        # (o, p, f, i) --> (f, o, p, i)
        self.d_W += dWopfi.transpose((2, 0, 1, 3))

        if sgd is not None:
            sgd(self._mem.weights, self._mem.gradient, key=self.id)
        return dXf.reshape((dXf.shape[0], self.nF, self.nI))

    return Yf, backward
|
2018-04-03 16:50:31 +03:00
|
|
|
|
2017-10-28 19:45:14 +03:00
|
|
|
def _add_padding(self, Yf):
    """Return `Yf` with `self.pad` stacked in front of it along axis 0."""
    return self.ops.xp.vstack((self.pad, Yf))
|
2017-10-28 19:45:14 +03:00
|
|
|
|
|
|
|
def _backprop_padding(self, dY, ids):
    """Add the padding gradient to `self.d_pad`; pass `dY` and `ids` through.

    Each row of `dY` is weighted by the number of negative entries in the
    matching row of `ids`, and the weighted rows are summed over axis 0.

    dY: Gradient array whose first axis lines up with the rows of `ids`.
        The (rows, 1, 1) broadcast below implies it is 3-D.
    ids: 2-D index array; negative entries are counted per row.
    RETURNS: The unchanged `(dY, ids)` pair.
    """
    # Number of negative ids in each row.
    n_negative = (ids < 0.0).sum(axis=1)
    weighted = dY * n_negative.reshape((ids.shape[0], 1, 1))
    self.d_pad += weighted.sum(axis=0)
    return dY, ids
|
2017-05-08 12:36:37 +03:00
|
|
|
|
2017-10-20 04:07:45 +03:00
|
|
|
@staticmethod
def init_weights(model):
    """Rescale `model.W` and shift `model.b` using randomly generated inputs.

    This is like the 'layer sequential unit variance' scheme, except that
    randomly generated whitened data stands in for real inputs. There is a
    huge number of inputs and the maxout unit makes the dynamics hard to
    guess, so the weights are set to values that empirically give whitened
    outputs for whitened inputs.

    Does nothing if `model.W` already contains non-zero values.
    """
    if (model.W ** 2).sum() != 0.0:
        return
    ops = model.ops
    xp = ops.xp
    ops.normal_init(model.W, model.nF * model.nI, inplace=True)

    # The uniform draw comes before the normal draw; keep that order so the
    # random stream is consumed the same way.
    ids = ops.allocate((5000, model.nF), dtype="f")
    ids += xp.random.uniform(0, 1000, ids.shape)
    ids = ops.asarray(ids, dtype="i")
    tokvecs = ops.allocate((5000, model.nI), dtype="f")
    noise = xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size)
    tokvecs += noise.reshape(tokvecs.shape)

    def predict(feat_ids, inputs):
        # nS ids. nW tokvecs. Exclude the padding array.
        hiddens = model(inputs[:-1])  # (nW, f, o, p)
        n_out = model.nO * model.nP
        # need nS vectors
        summed = ops.allocate((feat_ids.shape[0], n_out), dtype="f")
        hiddens = hiddens.reshape((hiddens.shape[0] * model.nF, n_out))
        ops.scatter_add(summed, feat_ids.flatten(), hiddens)
        summed = summed.reshape((summed.shape[0], model.nO, model.nP))
        summed += model.b
        summed = ops.asarray(summed)
        if model.nP >= 2:
            return ops.maxout(summed)[0]
        return summed * (summed >= 0)

    tol_var = 0.01
    tol_mean = 0.01
    max_rounds = 10
    for _ in range(max_rounds):
        acts = predict(ids, tokvecs)
        var = xp.var(acts)
        mean = xp.mean(acts)
        if abs(var - 1.0) >= tol_var:
            # Variance is off: rescale the weights and try again.
            model.W /= xp.sqrt(var)
            continue
        if abs(mean) >= tol_mean:
            # Variance is fine but the mean is off: shift the bias.
            model.b -= mean
            continue
        break
|
|
|
|
|
2017-05-08 15:24:43 +03:00
|
|
|
|
2017-09-22 17:38:36 +03:00
|
|
|
def link_vectors_to_models(vocab):
    """Store the vocab's vector data in `thinc.extra.load_nlp.VECTORS`.

    Along the way it names unnamed vectors `VECTORS_KEY` (warning W020 if
    they hold data), and sets `rank` on every entry of `vocab` to its row in
    `vectors.key2row`, or 0 when it has no row.

    vocab: Object exposing `.vectors` and iterable over entries that have
        `.orth` and a writable `.rank`.
    """
    vectors = vocab.vectors
    if vectors.name is None:
        vectors.name = VECTORS_KEY
        if vectors.data.size != 0:
            user_warning(Warnings.W020.format(shape=vectors.data.shape))
    ops = Model.ops
    for lexeme in vocab:
        lexeme.rank = (
            vectors.key2row[lexeme.orth] if lexeme.orth in vectors.key2row else 0
        )
    data = ops.asarray(vectors.data)
    # Set an entry here, so that vectors are accessed by StaticVectors
    # (unideal, I know)
    store = thinc.extra.load_nlp.VECTORS
    key = (ops.device, vectors.name)
    if key in store and store[key].shape != data.shape:
        # This is a hack to avoid the problem in #3853: a differently shaped
        # table is already stored under this name, so rename ours.
        old_name = vectors.name
        new_name = vectors.name + "_%d" % data.shape[0]
        user_warning(Warnings.W019.format(old=old_name, new=new_name))
        vectors.name = new_name
        key = (ops.device, vectors.name)
    store[key] = data
|
2017-08-18 22:55:23 +03:00
|
|
|
|
2017-10-27 15:39:30 +03:00
|
|
|
|
2018-09-13 20:28:35 +03:00
|
|
|
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
    """Create a bidirectional PyTorch LSTM wrapped for use as a layer.

    nO: Requested output width; each direction gets `nO // 2` hidden units.
    nI: Input width passed to `torch.nn.LSTM`.
    depth: Number of stacked LSTM layers. With 0, a no-op layer is returned.
    dropout: Dropout value handed to `torch.nn.LSTM`.
    RETURNS: `layerize(noop())` when `depth` is 0, otherwise the LSTM wrapped
        in `PyTorchWrapperRNN` and `with_square_sequences`.
    """
    if depth == 0:
        # Nothing to build, so this path must not require PyTorch.
        return layerize(noop())
    import torch.nn
    from thinc.api import with_square_sequences
    from thinc.extra.wrappers import PyTorchWrapperRNN

    model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
    return with_square_sequences(PyTorchWrapperRNN(model))
|
|
|
|
|
|
|
|
|
2017-09-21 15:59:48 +03:00
|
|
|
def Tok2Vec(width, embed_size, **kwargs):
    """Assemble a nested config from the keyword settings and build a Tok2Vec.

    When `USE_MODEL_REGISTRY_TOK2VEC` is false, the call is delegated to
    `_legacy_tok2vec.Tok2Vec` with the same arguments.

    width: Width used for the embedding mix, the encoders and the vectors.
    embed_size: Passed as "rows" to the multi-hash embedding.
    **kwargs: Optional settings read here: pretrained_vectors (None),
        cnn_maxout_pieces (3), subword_features (True), char_embed (False),
        conv_depth (4), bilstm_depth (0), conv_window (1).
    RETURNS: Whatever `new_ml.Tok2Vec` returns for the assembled config.
    """
    if not USE_MODEL_REGISTRY_TOK2VEC:
        # Preserve prior tok2vec for backwards compat, in v2.2.2
        return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)

    option = kwargs.get
    pretrained_vectors = option("pretrained_vectors", None)
    cnn_maxout_pieces = option("cnn_maxout_pieces", 3)
    subword_features = option("subword_features", True)
    char_embed = option("char_embed", False)
    conv_depth = option("conv_depth", 4)
    bilstm_depth = option("bilstm_depth", 0)
    conv_window = option("conv_window", 1)

    cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
    doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}

    # Both embedding variants mix their features with the same kind of layer.
    mix_cfg = {
        "arch": "spacy.LayerNormalizedMaxout.v1",
        "config": {"width": width, "pieces": 3},
    }
    if char_embed:
        embed_settings = {
            "width": 64,
            "chars": 6,
            "@mix": mix_cfg,
            "@embed_features": None,
        }
        embed_cfg = {"arch": "spacy.CharacterEmbed.v1", "config": embed_settings}
    else:
        embed_settings = {
            "width": width,
            "rows": embed_size,
            "columns": cols,
            "use_subwords": subword_features,
            "@pretrained_vectors": None,
            "@mix": mix_cfg,
        }
        embed_cfg = {"arch": "spacy.MultiHashEmbed.v1", "config": embed_settings}
    if pretrained_vectors:
        embed_cfg["config"]["@pretrained_vectors"] = {
            "arch": "spacy.PretrainedVectors.v1",
            "config": {
                "vectors_name": pretrained_vectors,
                "width": width,
                "column": cols.index("ID"),
            },
        }

    if cnn_maxout_pieces >= 2:
        cnn_arch = "spacy.MaxoutWindowEncoder.v1"
        cnn_settings = {
            "width": width,
            "window_size": conv_window,
            "pieces": cnn_maxout_pieces,
            "depth": conv_depth,
        }
    else:
        cnn_arch = "spacy.MishWindowEncoder.v1"
        cnn_settings = {
            "width": width,
            "window_size": conv_window,
            "depth": conv_depth,
        }
    cnn_cfg = {"arch": cnn_arch, "config": cnn_settings}
    bilstm_cfg = {
        "arch": "spacy.TorchBiLSTMEncoder.v1",
        "config": {"width": width, "depth": bilstm_depth},
    }

    # Pick the encoder: none, CNN followed by BiLSTM, CNN only, or BiLSTM only.
    if conv_depth == 0 and bilstm_depth == 0:
        encode_cfg = {}
    elif conv_depth >= 1:
        if bilstm_depth >= 1:
            encode_cfg = {
                "arch": "thinc.FeedForward.v1",
                "config": {"children": [cnn_cfg, bilstm_cfg]},
            }
        else:
            encode_cfg = cnn_cfg
    else:
        encode_cfg = bilstm_cfg

    config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg}
    return new_ml.Tok2Vec(config)
|
2017-05-15 22:46:08 +03:00
|
|
|
|
2017-05-04 14:31:40 +03:00
|
|
|
|
2017-09-22 17:37:03 +03:00
|
|
|
def reapply(layer, n_times):
    """Build a layer that feeds its input through `layer` several times in a row.

    The output of each application becomes the input of the next one.

    layer: Model exposing `begin_update(X, drop=...)`, which returns an
        `(output, backprop)` pair.
    n_times (int): Number of successive applications. Zero (or less) makes
        the resulting layer an identity function.
    RETURNS: The result of `wrap(reapply_fwd, layer)`.
    """

    def reapply_fwd(X, drop=0.0):
        # Seed the output with the input so that zero applications behaves as
        # the identity instead of failing on an unbound `Y`.
        Y = X
        backprops = []
        for _ in range(n_times):
            Y, backprop = layer.begin_update(Y, drop=drop)
            backprops.append(backprop)

        def reapply_bwd(dY, sgd=None):
            if not backprops:
                # Nothing was applied, so the gradient passes through as-is.
                return dY
            dX = None
            # Undo the applications in reverse order, feeding each gradient
            # into the previous callback and summing the gradients produced
            # at every step.
            for backprop in reversed(backprops):
                dY = backprop(dY, sgd=sgd)
                if dX is None:
                    dX = dY
                else:
                    dX += dY
            return dX

        return Y, reapply_bwd

    return wrap(reapply_fwd, layer)
|
|
|
|
|
|
|
|
|
2017-05-22 12:47:47 +03:00
|
|
|
def asarray(ops, dtype):
    """Create a layer that converts its input with `ops.asarray`.

    ops: Backend object providing `asarray(X, dtype=...)`.
    dtype: Data type forwarded to `ops.asarray`.
    RETURNS: The layer built by `layerize`. It has no backward callback.
    """

    def to_array(X, drop=0.0):
        converted = ops.asarray(X, dtype=dtype)
        # The conversion has nothing to backpropagate, hence None.
        return converted, None

    return layerize(to_array)
|
|
|
|
|
|
|
|
|
2017-05-20 14:40:10 +03:00
|
|
|
def _divide_array(X, size):
|
|
|
|
parts = []
|
|
|
|
index = 0
|
|
|
|
while index < len(X):
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
parts.append(X[index : index + size])
|
2017-05-20 14:40:10 +03:00
|
|
|
index += size
|
|
|
|
return parts
|
|
|
|
|
|
|
|
|
2017-05-04 14:31:40 +03:00
|
|
|
def get_col(idx):
    """Create a layer that selects index `idx` along the second axis.

    idx (int): Index into the second axis of the input. Must not be negative.
    RETURNS: The layer built by `layerize`.
    RAISES (IndexError): With message `Errors.E066` if `idx` is negative.
    """
    if idx < 0:
        raise IndexError(Errors.E066.format(value=idx))

    def forward(X, drop=0.0):
        # numpy arrays are handled by NumpyOps; any other input is assumed to
        # live on the GPU and is handled by CupyOps.
        ops = NumpyOps() if isinstance(X, numpy.ndarray) else CupyOps()
        column = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

        def backward(y, sgd=None):
            # Gradient has the input's shape; only the selected column receives
            # the incoming gradient (presumably `allocate` zero-fills -- verify).
            grad = ops.allocate(X.shape)
            grad[:, idx] += y
            return grad

        return column, backward

    return layerize(forward)
|
Update draft of parser neural network model
Model is good, but code is messy. Currently requires Chainer, which may cause the build to fail on machines without a GPU.
Outline of the model:
We first predict context-sensitive vectors for each word in the input:
(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(token_width)
>> convolution ** 4
This convolutional layer is shared between the tagger and the parser. This prevents the parser from needing tag features.
To boost the representation, we make a "super tag" with POS, morphology and dependency label. The tagger predicts this
by adding a softmax layer onto the convolutional layer --- so, we're teaching the convolutional layer to give us a
representation that's one affine transform from this informative lexical information. This is obviously good for the
parser (which backprops to the convolutions too).
The parser model makes a state vector by concatenating the vector representations for its context tokens. Current
results suggest few context tokens works well. Maybe this is a bug.
The current context tokens:
* S0, S1, S2: Top three words on the stack
* B0, B1: First two words of the buffer
* S0L1, S0L2: Leftmost and second leftmost children of S0
* S0R1, S0R2: Rightmost and second rightmost children of S0
* S1L1, S1L2, S1R2, S1R, B0L1, B0L2: Likewise for S1 and B0
This makes the state vector quite long: 13*T, where T is the token vector width (128 is working well). Fortunately,
there's a way to structure the computation to save some expense (and make it more GPU friendly).
The parser typically visits 2*N states for a sentence of length N (although it may visit more, if it back-tracks
with a non-monotonic transition). A naive implementation would require 2*N (B, 13*T) @ (13*T, H) matrix multiplications
for a batch of size B. We can instead perform one (B*N, T) @ (T, 13*H) multiplication, to pre-compute the hidden
weights for each positional feature wrt the words in the batch. (Note that our token vectors come from the CNN
-- so we can't play this trick over the vocabulary. That's how Stanford's NN parser works --- and why its model
is so big.)
This pre-computation strategy allows a nice compromise between GPU-friendliness and implementation simplicity.
The CNN and the wide lower layer are computed on the GPU, and then the precomputed hidden weights are moved
to the CPU, before we start the transition-based parsing process. This makes a lot of things much easier.
We don't have to worry about variable-length batch sizes, and we don't have to implement the dynamic oracle
in CUDA to train.
Currently the parser's loss function is multilabel log loss, as the dynamic oracle allows multiple states to
be 0 cost. This is defined as:
(exp(score) / Z) - (exp(score) / gZ)
Where gZ is the sum of the scores assigned to gold classes. I'm very interested in regressing on the cost directly,
but so far this isn't working well.
Machinery is in place for beam-search, which has been working well for the linear model. Beam search should benefit
greatly from the pre-computation trick.
2017-05-13 00:09:15 +03:00
|
|
|
|
|
|
|
|
|
|
|
def doc2feats(cols=None):
    """Create a layer that turns each doc into an attribute array.

    cols (list): Attribute IDs passed to `doc.to_array`. Defaults to
        `[ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]`.
    RETURNS: The layer built by `layerize`, with the chosen columns stored on
        its `cols` attribute.
    """
    if cols is None:
        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]

    def forward(docs, drop=0.0):
        # Feature extraction has no gradient, so no backward callback.
        return [doc.to_array(cols) for doc in docs], None

    model = layerize(forward)
    model.cols = cols
    return model
|
|
|
|
|
2017-05-27 23:50:40 +03:00
|
|
|
|
2017-05-06 21:38:12 +03:00
|
|
|
def print_shape(prefix):
    """Create a pass-through layer.

    The layer returns its input unchanged and hands gradients back unchanged.

    prefix: Accepted for interface compatibility; the current body does not
        use it.
    RETURNS: The layer built by `layerize`.
    """

    def passthrough_bwd(dX, **kwargs):
        # Identity backward pass; extra keyword arguments are ignored.
        return dX

    def forward(X, drop=0.0):
        return X, passthrough_bwd

    return layerize(forward)
|
2017-05-07 04:57:26 +03:00
|
|
|
|
2017-05-06 21:38:12 +03:00
|
|
|
|
|
|
|
@layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
    """Pick the vectors out of a `(tokens, attrs, vectors)` triple.

    tokens_attrs_vectors (tuple): Exactly three items; the third is returned
        as the layer output and the second is ignored.
    RETURNS (tuple): `(vectors, backward)`, where `backward(d_output)` returns
        `(tokens, d_output)`.
    """
    tokens, _attrs, vectors = tokens_attrs_vectors

    def backward(d_output, sgd=None):
        # Pair the gradient with the tokens it belongs to.
        return (tokens, d_output)

    return vectors, backward
|
2017-07-20 01:17:17 +03:00
|
|
|
|
|
|
|
|
|
|
|
@layerize
def logistic(X, drop=0.0):
    """Apply the element-wise logistic (sigmoid) function.

    X: Input scores. Converted with `xp.asarray` if it is not already an
        array of the module returned by `get_array_module`.
    drop: Accepted for layer-signature compatibility; not used here.
    RETURNS: A (Y, logistic_bwd) pair, where Y = 1 / (1 + exp(-X)) computed
        on the clipped input, and the callback maps dY onto dX.
    """
    xp = get_array_module(X)
    if not isinstance(X, xp.ndarray):
        X = xp.asarray(X)
    # Clip to range [-10, 10] so the argument of exp() stays bounded.
    # xp.clip returns a new array: the previous in-place minimum/maximum
    # overwrote the caller's input (typically the previous layer's output)
    # and failed on integer arrays because of the in-place cast.
    X = xp.clip(X, -10.0, 10.0)
    Y = 1.0 / (1.0 + xp.exp(-X))

    def logistic_bwd(dY, sgd=None):
        # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)), with Y = sigmoid(x).
        dX = dY * (Y * (1 - Y))
        return dX

    return Y, logistic_bwd
|
|
|
|
|
|
|
|
|
|
|
|
def zero_init(model):
    """Register a data hook on the model that zeroes its weights.

    model: An object exposing an `on_data_hooks` list and, by the time the
        hook runs, a `W` array supporting `.fill()`.
    RETURNS: The same model object, to allow inline use.
    """

    def _zero_init_impl(self, X, y):
        # Invoked as a hook with the model and the data; only the weight
        # matrix is touched.
        self.W.fill(0)

    hooks = model.on_data_hooks
    hooks.append(_zero_init_impl)
    return model
|
|
|
|
|
2017-10-27 15:39:30 +03:00
|
|
|
|
2017-08-18 22:55:23 +03:00
|
|
|
def getitem(i):
    """Create a layer that selects item `i` from its input.

    i: The index or key applied to the layer's input via `X[i]`.
    RETURNS: The result of `layerize` applied to the forward function.
    """

    def getitem_fwd(X, drop=0.0):
        selected = X[i]
        # No backward callback is supplied for this layer.
        return selected, None

    return layerize(getitem_fwd)
|
|
|
|
|
2017-10-27 13:16:41 +03:00
|
|
|
|
2018-09-24 18:35:28 +03:00
|
|
|
@describe.attributes(
    W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
)
class MultiSoftmax(Affine):
    """Neural network layer that predicts several multi-class attributes at once.

    The output columns are split into consecutive groups, one group per entry
    of `out_sizes`, and each group is softmaxed on its own. For instance, with
    one class of 6 values and another of 5, the layer predicts 11 outputs:
    columns 0-5 form one probability distribution and columns 6-10 another.
    """

    name = "multisoftmax"

    def __init__(self, out_sizes, nI=None, **kwargs):
        # Model.__init__ is invoked directly rather than Affine.__init__;
        # the output width is the sum of all group sizes.
        Model.__init__(self, **kwargs)
        self.out_sizes = out_sizes
        self.nO = sum(out_sizes)
        self.nI = nI

    def predict(self, input__BI):
        """Run the affine transform, then softmax each column group in place."""
        output__BO = self.ops.affine(self.W, self.b, input__BI)
        start = 0
        for size in self.out_sizes:
            stop = start + size
            self.ops.softmax(output__BO[:, start:stop], inplace=True)
            start = stop
        return output__BO

    def begin_update(self, input__BI, drop=0.0):
        """Return the predictions and a callback that backpropagates through
        the affine transform.
        """
        output__BO = self.predict(input__BI)

        def finish_update(grad__BO, sgd=None):
            # Accumulate the parameter gradients before computing the
            # gradient passed to the layer below.
            self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
            self.d_b += grad__BO.sum(axis=0)
            grad__BI = self.ops.gemm(grad__BO, self.W)
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return grad__BI

        return output__BO, finish_update
|
|
|
|
|
|
|
|
|
2018-09-25 11:57:59 +03:00
|
|
|
def build_tagger_model(nr_class, **cfg):
    """Assemble the tagger network: a tok2vec layer chained into a softmax.

    nr_class: Number of output classes of the softmax layer.
    **cfg: Optional settings read here: "token_vector_width", "tok2vec",
        "pretrained_vectors" and "subword_features".
    RETURNS: The chained model, with `tok2vec` and `softmax` attached as
        attributes and `nI` set to None.
    """
    embed_size = util.env_opt("embed_size", 2000)
    # An explicit config value wins over the environment option.
    token_vector_width = (
        cfg["token_vector_width"]
        if "token_vector_width" in cfg
        else util.env_opt("token_vector_width", 96)
    )
    pretrained_vectors = cfg.get("pretrained_vectors")
    subword_features = cfg.get("subword_features", True)
    with Model.define_operators({">>": chain, "+": add}):
        if "tok2vec" not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                subword_features=subword_features,
                pretrained_vectors=pretrained_vectors,
            )
        else:
            # Reuse a token-to-vector layer supplied by the caller.
            tok2vec = cfg["tok2vec"]
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
|
|
|
|
|
2019-03-08 15:28:53 +03:00
|
|
|
|
2018-09-25 11:57:59 +03:00
|
|
|
def build_morphologizer_model(class_nums, **cfg):
    """Assemble the morphologizer network: tok2vec chained into a MultiSoftmax.

    class_nums: Sizes of the output groups passed to MultiSoftmax.
    **cfg: Optional settings read here: "token_vector_width", "tok2vec",
        "pretrained_vectors" and "char_embed".
    RETURNS: The chained model, with `tok2vec` and `softmax` attached as
        attributes and `nI` set to None.
    """
    embed_size = util.env_opt("embed_size", 7000)
    # An explicit config value wins over the environment option.
    token_vector_width = (
        cfg["token_vector_width"]
        if "token_vector_width" in cfg
        else util.env_opt("token_vector_width", 128)
    )
    pretrained_vectors = cfg.get("pretrained_vectors")
    char_embed = cfg.get("char_embed", True)
    with Model.define_operators({">>": chain, "+": add, "**": clone}):
        if "tok2vec" not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                char_embed=char_embed,
                pretrained_vectors=pretrained_vectors,
            )
        else:
            # Reuse a token-to-vector layer supplied by the caller.
            tok2vec = cfg["tok2vec"]
        softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
        # The group sizes are also exposed on the flattening wrapper.
        softmax.out_sizes = class_nums
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
|
|
|
|
|
2017-08-06 02:13:23 +03:00
|
|
|
|
2017-09-01 17:39:55 +03:00
|
|
|
@layerize
def SpacyVectors(docs, drop=0.0):
    """Fetch, for every doc, the vector-table rows of its tokens.

    docs: Iterable of docs; each token's `orth` is looked up in
        `doc.vocab.vectors.key2row`.
    drop: Accepted for layer-signature compatibility; not used here.
    RETURNS: A (batch, None) pair, where batch holds one array of vectors
        per doc. No backward callback is provided.
    """
    batch = []
    for doc in docs:
        table = doc.vocab.vectors
        key2row = table.key2row
        rows = numpy.zeros((len(doc),), dtype="i")
        for i, word in enumerate(doc):
            orth = word.orth
            # Keys that are absent from the table fall back to row 0.
            rows[i] = key2row[orth] if orth in key2row else 0
        batch.append(table.data[rows])
    return batch, None
|
|
|
|
|
|
|
|
|
2017-07-20 01:17:17 +03:00
|
|
|
def build_text_classifier(nr_class, width=64, **cfg):
    """Assemble the text classification network.

    Unless the "low_data" shortcut applies, the model concatenates the output
    of a bag-of-words linear model with the output of a CNN model and feeds
    the result to a final output layer.

    nr_class: Number of output classes.
    width: Width used for the embedding, convolution and attention layers.
    **cfg: Optional settings read here: "depth", "nr_vector",
        "pretrained_dims", "low_data", "ngram_size" and "exclusive_classes".
    RETURNS: The assembled model. In the main path it carries `tok2vec`,
        `nO` and `lsuv` attributes; the "low_data" path returns the bare
        chained model without them.
    """
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
        # Shortcut: with "low_data" set and pretrained vectors available,
        # build a smaller model on top of the static vectors only.
        if cfg.get("low_data") and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        # Hash embeddings over feature columns 1-4 of the extracted
        # attribute array; the three narrower tables use half the width.
        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        # The Maxout input size matches the concatenated embedding widths:
        # width + 3 * (width // 2).
        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
        ) >> with_flatten(
            uniqued(
                (lower | prefix | suffix | shape)
                >> LN(Maxout(width, width + (width // 2) * 3)),
                column=0,
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims)
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            # Presumably the trained and static vectors (each `width` wide)
            # are joined feature-wise -- confirm against concatenate_lists.
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            # NOTE(review): static_vectors is not read again in this function.
            static_vectors = None
        # Mix the input vectors down to `width`, then apply `depth` residual
        # window-convolution blocks.
        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width))
            >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = build_bow_text_classifier(
            nr_class,
            ngram_size=cfg.get("ngram_size", 1),
            exclusive_classes=cfg.get("exclusive_classes", False),
        )
        # The output layer takes nr_class * 2 inputs: nr_class from the
        # linear model plus nr_class from the CNN model.
        if cfg.get("exclusive_classes", False):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
            )
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model
|
|
|
|
|
2017-10-27 15:39:30 +03:00
|
|
|
|
2019-04-01 13:11:27 +03:00
|
|
|
def build_bow_text_classifier(
    nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg
):
    """Assemble a bag-of-ngrams linear text classifier.

    nr_class: Number of output classes.
    ngram_size: Passed to `extract_ngrams`.
    exclusive_classes: Selects `cpu_softmax` (True) or `logistic` (False)
        as the output layer.
    no_output_layer: If True, no output non-linearity is appended.
    **cfg: Accepted but not read here.
    RETURNS: The assembled model with `nO` set to nr_class.
    """
    with Model.define_operators({">>": chain}):
        ngram_features = extract_ngrams(ngram_size, attr=ORTH)
        linear = LinearModel(nr_class)
        model = with_cpu(Model.ops, ngram_features >> linear)
        if not no_output_layer:
            if exclusive_classes:
                output_layer = cpu_softmax
            else:
                output_layer = logistic
            model = model >> output_layer
    model.nO = nr_class
    return model
|
|
|
|
|
|
|
|
|
|
|
|
@layerize
def cpu_softmax(X, drop=0.0):
    """Softmax the input using a freshly created NumpyOps instance.

    X: Input scores.
    drop: Accepted for layer-signature compatibility; not used here.
    RETURNS: A (Y, cpu_softmax_backward) pair; the callback returns the
        incoming gradient unchanged.
    """

    def cpu_softmax_backward(dY, sgd=None):
        # The gradient is passed through as-is.
        return dY

    Y = NumpyOps().softmax(X)
    return Y, cpu_softmax_backward
|
|
|
|
|
|
|
|
|
2019-02-23 13:57:16 +03:00
|
|
|
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
    """
    Build a simple CNN text classifier on top of a token-to-vector model.

    The token vectors are mean-pooled and passed to an output layer. With
    exclusive_classes=True the output layer is a softmax, so the outputs
    sum to 1. With exclusive_classes=False a zero-initialised affine layer
    followed by a logistic non-linearity is used instead, so each output
    lies in the range [0, 1].
    """
    with Model.define_operators({">>": chain}):
        if not exclusive_classes:
            scores = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0))
            output_layer = scores >> logistic
        else:
            output_layer = Softmax(nr_class, tok2vec.nO)
        model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
    model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    return model
|
|
|
|
|
|
|
|
|
2019-06-29 15:52:36 +03:00
|
|
|
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    """Build the context encoder for named entity linking.

    A Tok2Vec encoder is followed by mean pooling, a residual Maxout layer
    and a final Affine projection to the entity width.

    embed_width: Passed to Tok2Vec as `embed_size`.
    hidden_width: Width of the Tok2Vec output and of the Maxout layer.
    ner_types: Accepted for the signature; not read by this function.
    **cfg: Must contain "entity_width" (the output width). May contain
        "conv_depth" (default 2), "cnn_maxout_pieces" (default 3) and
        "pretrained_vectors" (default None).
    RETURNS: The composed model, with `tok2vec` and `nO` attributes set.
    RAISES: ValueError (Errors.E144) if "entity_width" is not in cfg.
    """
    if "entity_width" not in cfg:
        raise ValueError(Errors.E144.format(param="entity_width"))
    context_width = cfg.get("entity_width")

    # Settings for the context encoder; optional values fall back to defaults.
    tok2vec_settings = dict(
        width=hidden_width,
        embed_size=embed_width,
        pretrained_vectors=cfg.get("pretrained_vectors", None),
        cnn_maxout_pieces=cfg.get("cnn_maxout_pieces", 3),
        subword_features=True,
        conv_depth=cfg.get("conv_depth", 2),
        bilstm_depth=0,
    )

    with Model.define_operators({">>": chain, "**": clone}):
        tok2vec = Tok2Vec(**tok2vec_settings)
        # One vector per document: flatten the token vectors and mean-pool.
        pooled = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)
        model = (
            pooled
            >> Residual(zero_init(Maxout(hidden_width, hidden_width)))
            >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
        )
        model.tok2vec = tok2vec
        model.nO = context_width
    return model
|
2019-06-03 22:32:54 +03:00
|
|
|
|
2019-07-11 13:02:25 +03:00
|
|
|
|
2017-09-01 17:39:55 +03:00
|
|
|
@layerize
def flatten(seqs, drop=0.0):
    """Join a list of sequences into one flat array.

    seqs: The sequences to join. Each one must support `len()`.
    drop: Accepted for the layer interface; not used here.
    RETURNS: A tuple `(X, backprop)`. `X` is `ops.flatten(seqs, pad=0)` and
        `backprop` splits a flat gradient back up using the input lengths.
    """
    ops = Model.ops
    seq_lengths = ops.asarray(list(map(len, seqs)), dtype="i")
    flat = ops.flatten(seqs, pad=0)

    def unflatten_gradient(d_X, sgd=None):
        # Undo the flattening so each input sequence gets its own gradient.
        return ops.unflatten(d_X, seq_lengths, pad=0)

    return flat, unflatten_gradient
|
|
|
|
|
|
|
|
|
2017-10-27 15:39:30 +03:00
|
|
|
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Concatenate the outputs of several list-producing layers.

    Each layer is chained with `flatten`, the flattened outputs are combined
    with `concatenate`, and the result is split back into one array per
    input item using the input lengths.

    *layers: The models to combine. With no layers, `noop()` is returned.
    **kwargs: May contain "drop_factor" (default 1.0), a multiplier applied
        to the dropout rate on the forward pass.
    RETURNS: A model wrapping the concatenated layers.
    """
    if len(layers) == 0:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    concat = concatenate(*[chain(layer, flatten) for layer in layers])

    def concatenate_lists_fwd(Xs, drop=0.0):
        # A dropout of None is passed through untouched.
        if drop is not None:
            drop = drop * drop_factor
        lengths = ops.asarray(list(map(len, Xs)), dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ops.unflatten(flat_y, lengths), concatenate_lists_bwd

    return wrap(concatenate_lists_fwd, concat)
|
💫 Better support for semi-supervised learning (#3035)
The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train
One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.
Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.
Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.
Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:
python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze
Implement rehearsal methods for pipeline components
The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:
Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.
Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.
Implement rehearsal updates for tagger
Implement rehearsal updates for text categorizer
2018-12-10 18:25:33 +03:00
|
|
|
|
|
|
|
|
|
|
|
def masked_language_model(vocab, model, mask_prob=0.15):
    """Wrap a model so that it is trained as a BERT-style masked language model.

    vocab: Vocabulary handed to `_RandomWords` to supply replacement words.
    model: The model to wrap. It receives the masked docs.
    mask_prob: Per-token masking probability, forwarded to `_apply_mask`.
    RETURNS: A model wrapping `model` with masking applied on the forward pass.
    """
    word_sampler = _RandomWords(vocab)

    def mlm_forward(docs, drop=0.0):
        kept, masked_docs = _apply_mask(docs, word_sampler, mask_prob=mask_prob)
        # Column vector so it broadcasts over each row of the gradient.
        kept = model.ops.asarray(kept).reshape((kept.shape[0], 1))
        output, backprop = model.begin_update(masked_docs, drop=drop)

        def mlm_backward(d_output, sgd=None):
            # Zero the gradient rows of tokens that were left unmasked
            # (modifies d_output in place).
            d_output *= 1 - kept
            return backprop(d_output, sgd=sgd)

        return output, mlm_backward

    return wrap(mlm_forward, model)
|
|
|
|
|
|
|
|
|
|
|
|
class _RandomWords(object):
    """Sample random words from a vocabulary, weighted by lexeme probability.

    Only the first 10000 vocabulary entries with a non-zero `prob` are used.
    """

    def __init__(self, vocab):
        """Collect the candidate words and their sampling distribution.

        vocab: Iterable of lexeme-like objects exposing `text` and `prob`.
        """
        words = []
        raw_probs = []
        # Single pass with early exit: avoids walking the vocabulary twice
        # and building full-vocab lists just to keep 10000 entries. It also
        # keeps words and probs aligned if `vocab` is a one-shot iterable.
        for lex in vocab:
            prob = lex.prob
            if prob == 0.0:
                continue
            words.append(lex.text)
            raw_probs.append(prob)
            if len(words) == 10000:
                break
        self.words = words
        # `prob` is exponentiated, so it is presumably a log-probability
        # (TODO confirm). The result is normalised to sum to 1.
        self.probs = numpy.exp(numpy.array(raw_probs, dtype="f"))
        self.probs /= self.probs.sum()
        self._cache = []

    def next(self):
        """Return one randomly sampled word.

        Indices are drawn 10000 at a time and cached, so most calls only pop
        from the cache.
        """
        if not self._cache:
            self._cache.extend(
                numpy.random.choice(len(self.words), 10000, p=self.probs)
            )
        index = self._cache.pop()
        return self.words[index]
|
|
|
|
|
|
|
|
|
|
|
|
def _apply_mask(docs, random_words, mask_prob=0.15):
    """Create masked copies of the docs for the masked language model.

    docs: The original docs. They are not modified.
    random_words: Word sampler passed on to `_replace_word`.
    mask_prob: A token is selected for replacement when its uniform draw
        falls below this value.
    RETURNS: A tuple `(mask, masked_docs)`. `mask` is a boolean array with
        one entry per token across all docs, True where the token was left
        alone. `masked_docs` are new Doc objects built from the (possibly
        replaced) words.
    """
    # Imported here rather than at module level to avoid circular imports
    from .tokens.doc import Doc

    total_tokens = sum(len(doc) for doc in docs)
    keep = numpy.random.uniform(0.0, 1.0, (total_tokens,)) >= mask_prob
    masked_docs = []
    offset = 0
    for doc in docs:
        words = [
            token.text
            if keep[offset + j]
            else _replace_word(token.text, random_words)
            for j, token in enumerate(doc)
        ]
        offset += len(doc)
        spaces = [bool(token.whitespace_) for token in doc]
        # NB: New docs are built on purpose. The original docs are still
        # used to make the target vectors, so the original tokens must
        # survive. If this is ever changed to modify the docs in place,
        # take care that the IDs still reflect the original words.
        masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
    return keep, masked_docs
|
|
|
|
|
|
|
|
|
|
|
|
def _replace_word(word, random_words, mask="[MASK]"):
    """Pick the replacement for a token selected for masking.

    One uniform draw decides the outcome: below 0.8 the mask symbol is
    returned, from 0.8 up to 0.9 a word from `random_words.next()`, and
    otherwise the original word.

    word: The original token text.
    random_words: Object with a `next()` method returning a random word.
    mask: The mask symbol.
    RETURNS: The text to use in place of the token.
    """
    roll = numpy.random.random()
    if roll >= 0.9:
        return word
    if roll >= 0.8:
        return random_words.next()
    return mask
|
2019-03-09 14:50:08 +03:00
|
|
|
|
|
|
|
|
|
|
|
def _uniform_init(lo, hi):
    """Create a weight initializer that draws from a uniform distribution.

    lo: Lower bound handed to `random.uniform`.
    hi: Upper bound handed to `random.uniform`.
    RETURNS: A function `(W, ops)` that draws an array of shape `W.shape`
        and hands it to `copy_array` together with `W`.
    """

    def wrapped(W, ops):
        samples = ops.xp.random.uniform(lo, hi, W.shape)
        copy_array(W, samples)

    return wrapped
|
|
|
|
|
|
|
|
|
|
|
|
@describe.attributes(
    nM=Dimension("Vector dimensions"),
    nC=Dimension("Number of characters per word"),
    vectors=Synapses(
        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
    ),
    d_vectors=Gradient("vectors"),
)
class CharacterEmbed(Model):
    """Embed each token as the concatenation of per-character vectors.

    Every one of the `nC` character positions has its own table of `nV`
    vectors of width `nM`, so each token gets a vector of width `nM * nC`.
    """

    def __init__(self, nM=None, nC=None, **kwargs):
        """Create the layer.

        nM: Width of each per-character vector.
        nC: Number of characters used per word.
        **kwargs: Forwarded to `Model.__init__`.
        """
        Model.__init__(self, **kwargs)
        self.nM = nM
        self.nC = nC

    @property
    def nO(self):
        """Output width per token: one `nM`-wide vector per character slot."""
        return self.nM * self.nC

    @property
    def nV(self):
        """Number of rows in each per-position embedding table."""
        return 256

    def begin_update(self, docs, drop=0.0):
        """Embed the tokens of each doc and return a backprop callback.

        docs: The docs to embed. Each must provide
            `to_utf8_array(nr_char=...)` and support `len()`.
        drop: Accepted for the layer interface; not used here.
        RETURNS: A tuple `(output, backprop)`. `output` has one array of
            shape `(len(doc), nO)` per doc. For empty input, `output` is an
            empty list and `backprop` does nothing.
        """
        if not docs:
            # Always return the (output, backprop) pair. A bare list here
            # would break callers that unpack two values.
            def backprop_nothing(d_vectors, sgd=None):
                return None

            return [], backprop_nothing
        ids = []
        output = []
        weights = self.vectors
        # This assists in indexing; it's like looping over this dimension.
        # Still consider this weird witch craft...But thanks to Mark Neumann
        # for the tip.
        nCv = self.ops.xp.arange(self.nC)
        for doc in docs:
            doc_ids = doc.to_utf8_array(nr_char=self.nC)
            doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
            # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
            # incantation do I chant to get
            # output[i, j, k] == data[j, ids[i, j], k]?
            doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
            output.append(doc_vectors.reshape((len(doc), self.nO)))
            ids.append(doc_ids)

        def backprop_character_embed(d_vectors, sgd=None):
            """Add the output gradients into the embedding-table gradient."""
            gradient = self.d_vectors
            for doc_ids, d_doc_vectors in zip(ids, d_vectors):
                d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
                # NOTE(review): NumPy's fancy-index `+=` does not accumulate
                # repeated index pairs, so tokens sharing a byte at the same
                # position may contribute only once. Confirm whether a
                # scatter-add is intended here.
                gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            # The inputs are docs, so there is no gradient to pass back.
            return None

        return output, backprop_character_embed
|
|
|
|
|
|
|
|
|
2019-10-08 00:34:58 +03:00
|
|
|
def get_cossim_loss(yh, y, ignore_zeros=False):
    """Compute a cosine-similarity loss and its gradient.

    yh: Predicted vectors, one per row (reductions run over axis 1).
    y: Target vectors, same layout as `yh`.
    ignore_zeros: If True, rows whose target is all zeros contribute neither
        loss nor gradient.
    RETURNS: A tuple `(loss, gradient)`. `loss` is the sum over rows of
        `abs(cosine - 1)` and `gradient` is the negated derivative of the
        cosine with respect to `yh`.
    """
    xp = get_array_module(yh)
    # Find the all-zero targets before the epsilon shift below hides them.
    zero_rows = xp.abs(y).sum(axis=1) == 0 if ignore_zeros else None
    # Add a small constant to avoid 0 vectors
    yh = yh + 1e-8
    y = y + 1e-8
    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
    yh_norm = xp.linalg.norm(yh, axis=1, keepdims=True)
    y_norm = xp.linalg.norm(y, axis=1, keepdims=True)
    norm_product = yh_norm * y_norm
    cosine = (yh * y).sum(axis=1, keepdims=True) / norm_product
    d_yh = (y / norm_product) - (cosine * (yh / yh_norm ** 2))
    losses = xp.abs(cosine - 1)
    if zero_rows is not None:
        # If the target was a zero vector, don't count it in the loss.
        d_yh[zero_rows] = 0
        losses[zero_rows] = 0
    return losses.sum(), -d_yh
|