Merge branch 'develop' into feature/refactor-config-args

This commit is contained in:
Ines Montani 2020-07-10 22:50:07 +02:00
commit 7b5717cac3
4 changed files with 92 additions and 16 deletions

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "3.0.0a3" __version__ = "3.0.0a4"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -245,6 +245,13 @@ class ParserStepModel(Model):
for class_ in unseen_classes: for class_ in unseen_classes:
self._class_mask[class_] = 0. self._class_mask[class_] = 0.
def clear_memory(self):
    """Delete the per-batch attributes stored on this step model.

    Removes, in this order, the attributes ``tokvecs``, ``bp_tokvecs``,
    ``state2vec``, ``backprops`` and ``_class_mask`` from the instance, so
    the objects they referenced are no longer kept alive by it. The
    instance is not usable for further steps afterwards.

    Raises:
        AttributeError: if one of the attributes is not set (for example
            when this method is called a second time). Attributes listed
            before the missing one have already been removed by then.
    """
    # Same order as the explicit ``del`` statements this loop replaces.
    for attr_name in (
        "tokvecs",
        "bp_tokvecs",
        "state2vec",
        "backprops",
        "_class_mask",
    ):
        delattr(self, attr_name)
@property @property
def nO(self): def nO(self):
if self.attrs["has_upper"]: if self.attrs["has_upper"]:
@ -273,6 +280,19 @@ class ParserStepModel(Model):
c_ids += ids.shape[1] c_ids += ids.shape[1]
return ids return ids
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
    """Record one step's gradient inputs on ``self.backprops``.

    Appends a ``(token_ids, d_vector, get_d_tokvecs)`` tuple to
    ``self.backprops``. When ``self.state2vec.ops`` is a ``CupyOps`` and
    ``token_ids`` is not already an ``ops.xp.ndarray``, both ``token_ids``
    and ``d_vector`` are first passed through ``util.get_async`` together
    with ``self.cuda_stream``.

    Args:
        token_ids: ids for this step, stored as the first tuple element.
        d_vector: gradient for this step, stored as the second element.
        get_d_tokvecs: callback stored unchanged as the third element.
    """
    ops = self.state2vec.ops
    # Short-circuit keeps ``ops.xp`` from being touched on non-Cupy ops.
    needs_transfer = isinstance(ops, CupyOps) and not isinstance(
        token_ids, ops.xp.ndarray
    )
    if needs_transfer:
        # Move token_ids and d_vector to GPU, asynchronously
        token_ids = util.get_async(self.cuda_stream, token_ids)
        d_vector = util.get_async(self.cuda_stream, d_vector)
    self.backprops.append((token_ids, d_vector, get_d_tokvecs))
def finish_steps(self, golds): def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing # Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient. # values don't affect the real gradient.
@ -315,16 +335,7 @@ def step_forward(model: ParserStepModel, states, is_train):
d_vector = get_d_vector(d_scores) d_vector = get_d_vector(d_scores)
if mask is not None: if mask is not None:
d_vector *= mask d_vector *= mask
if isinstance(model.state2vec.ops, CupyOps) \ model.backprop_step(token_ids, d_vector, get_d_tokvecs)
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
model.backprops.append((
util.get_async(model.cuda_stream, token_ids),
util.get_async(model.cuda_stream, d_vector),
get_d_tokvecs
))
else:
model.backprops.append((token_ids, d_vector, get_d_tokvecs))
return None return None
return scores, backprop_parser_step return scores, backprop_parser_step

View File

@ -200,6 +200,8 @@ cdef class Parser:
with nogil: with nogil:
self._parseC(&states[0], self._parseC(&states[0],
weights, sizes) weights, sizes)
model.clear_memory()
del model
return batch return batch
cdef void _parseC(self, StateC** states, cdef void _parseC(self, StateC** states,
@ -312,6 +314,13 @@ cdef class Parser:
if set_annotations: if set_annotations:
docs = [eg.predicted for eg in examples] docs = [eg.predicted for eg in examples]
self.set_annotations(docs, all_states) self.set_annotations(docs, all_states)
# Ugh, this is annoying. If we're working on GPU, we want to free the
# memory ASAP. It seems that Python doesn't necessarily get around to
# removing these in time if we don't explicitly delete? It's confusing.
del backprop
del backprop_tok2vec
model.clear_memory()
del model
return losses return losses
def rehearse(self, examples, sgd=None, losses=None, **cfg): def rehearse(self, examples, sgd=None, losses=None, **cfg):
@ -335,7 +344,7 @@ cdef class Parser:
set_dropout_rate(self._rehearsal_model, 0.0) set_dropout_rate(self._rehearsal_model, 0.0)
set_dropout_rate(self.model, 0.0) set_dropout_rate(self.model, 0.0)
tutor, _ = self._rehearsal_model.begin_update(docs) tutor, _ = self._rehearsal_model.begin_update(docs)
model, finish_update = self.model.begin_update(docs) model, backprop_tok2vec = self.model.begin_update(docs)
n_scores = 0. n_scores = 0.
loss = 0. loss = 0.
while states: while states:
@ -351,10 +360,16 @@ cdef class Parser:
states = [state for state in states if not state.is_final()] states = [state for state in states if not state.is_final()]
n_scores += d_scores.size n_scores += d_scores.size
# Do the backprop # Do the backprop
finish_update(docs) backprop_tok2vec(docs)
if sgd is not None: if sgd is not None:
self.model.finish_update(sgd) self.model.finish_update(sgd)
losses[self.name] += loss / n_scores losses[self.name] += loss / n_scores
del backprop
del backprop_tok2vec
model.clear_memory()
tutor.clear_memory()
del model
del tutor
return losses return losses
def get_gradients(self): def get_gradients(self):

View File

@ -488,7 +488,8 @@ data for machine learning models, developed by us. It integrates with spaCy
out-of-the-box and provides many different out-of-the-box and provides many different
[annotation recipes](https://prodi.gy/docs/recipes) for a variety of NLP tasks, [annotation recipes](https://prodi.gy/docs/recipes) for a variety of NLP tasks,
with and without a model in the loop. If Prodigy is installed in your project, with and without a model in the loop. If Prodigy is installed in your project,
you can you can start the annotation server from your `project.yml` for a tight feedback
loop between data development and training.
The following example command starts the Prodigy app using the The following example command starts the Prodigy app using the
[`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in [`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in
@ -497,6 +498,12 @@ then correct the suggestions manually in the UI. After you save and exit the
server, the full dataset is exported in spaCy's format and split into a training server, the full dataset is exported in spaCy's format and split into a training
and evaluation set. and evaluation set.
> #### Example usage
>
> ```bash
> $ python -m spacy project run annotate
> ```
<!-- prettier-ignore --> <!-- prettier-ignore -->
```yaml ```yaml
### project.yml ### project.yml
@ -509,7 +516,9 @@ commands:
- name: annotate - name: annotate
- script: - script:
- 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}' - 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
- 'python -m prodigy data-to-spacy ./corpus/train.spacy ./corpus/eval.spacy --ner {PRODIGY_DATASET}' - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}'
- 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
- 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
- deps: - deps:
- 'assets/raw_data.jsonl' - 'assets/raw_data.jsonl'
- outputs: - outputs:
@ -517,6 +526,15 @@ commands:
- 'corpus/eval.spacy' - 'corpus/eval.spacy'
``` ```
You can use the same approach for other types of projects and annotation
workflows, including
[text classification](https://prodi.gy/docs/recipes#textcat),
[dependency parsing](https://prodi.gy/docs/recipes#dep),
[part-of-speech tagging](https://prodi.gy/docs/recipes#pos) or fully
[custom recipes](https://prodi.gy/docs/custom-recipes) – for instance, an A/B
evaluation workflow that lets you compare two different models and their
results.
<Project id="integrations/prodigy"> <Project id="integrations/prodigy">
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
@ -567,6 +585,12 @@ MODELS = [name.strip() for name in sys.argv[1].split(",")]
spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"]) spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"])
``` ```
> #### Example usage
>
> ```bash
> $ python -m spacy project run visualize
> ```
<!-- prettier-ignore --> <!-- prettier-ignore -->
```yaml ```yaml
### project.yml ### project.yml
@ -591,7 +615,33 @@ mattis pretium.
### FastAPI {#fastapi} <IntegrationLogo name="fastapi" width={100} height="auto" align="right" /> ### FastAPI {#fastapi} <IntegrationLogo name="fastapi" width={100} height="auto" align="right" />
<!-- TODO: come up with example there's not much integration needed, but it'd be nice to show an example that addresses some of the main concerns for serving ML (workers etc.) --> [FastAPI](https://fastapi.tiangolo.com/) is a modern high-performance framework
for building REST APIs with Python, based on Python
[type hints](https://fastapi.tiangolo.com/python-types/). It's become a popular
library for serving machine learning models.
```python
# TODO: show an example that addresses some of the main concerns for serving ML (workers etc.)
```
> #### Example usage
>
> ```bash
> $ python -m spacy project run serve
> ```
<!-- prettier-ignore -->
```yaml
### project.yml
commands:
- name: serve
help: "Serve the trained model with FastAPI"
script:
- 'python ./scripts/serve.py ./training/model-best'
deps:
- 'training/model-best'
no_skip: true
```
<Project id="integrations/fastapi"> <Project id="integrations/fastapi">