mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge branch 'develop' into feature/refactor-config-args
This commit is contained in:
commit
7b5717cac3
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a3"
|
__version__ = "3.0.0a4"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
|
|
|
@ -245,6 +245,13 @@ class ParserStepModel(Model):
|
||||||
for class_ in unseen_classes:
|
for class_ in unseen_classes:
|
||||||
self._class_mask[class_] = 0.
|
self._class_mask[class_] = 0.
|
||||||
|
|
||||||
|
def clear_memory(self):
|
||||||
|
del self.tokvecs
|
||||||
|
del self.bp_tokvecs
|
||||||
|
del self.state2vec
|
||||||
|
del self.backprops
|
||||||
|
del self._class_mask
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def nO(self):
|
def nO(self):
|
||||||
if self.attrs["has_upper"]:
|
if self.attrs["has_upper"]:
|
||||||
|
@ -273,6 +280,19 @@ class ParserStepModel(Model):
|
||||||
c_ids += ids.shape[1]
|
c_ids += ids.shape[1]
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
|
||||||
|
if isinstance(self.state2vec.ops, CupyOps) \
|
||||||
|
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
||||||
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
|
self.backprops.append((
|
||||||
|
util.get_async(self.cuda_stream, token_ids),
|
||||||
|
util.get_async(self.cuda_stream, d_vector),
|
||||||
|
get_d_tokvecs
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
|
||||||
|
|
||||||
|
|
||||||
def finish_steps(self, golds):
|
def finish_steps(self, golds):
|
||||||
# Add a padding vector to the d_tokvecs gradient, so that missing
|
# Add a padding vector to the d_tokvecs gradient, so that missing
|
||||||
# values don't affect the real gradient.
|
# values don't affect the real gradient.
|
||||||
|
@ -315,16 +335,7 @@ def step_forward(model: ParserStepModel, states, is_train):
|
||||||
d_vector = get_d_vector(d_scores)
|
d_vector = get_d_vector(d_scores)
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
d_vector *= mask
|
d_vector *= mask
|
||||||
if isinstance(model.state2vec.ops, CupyOps) \
|
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
|
||||||
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
|
|
||||||
# Move token_ids and d_vector to GPU, asynchronously
|
|
||||||
model.backprops.append((
|
|
||||||
util.get_async(model.cuda_stream, token_ids),
|
|
||||||
util.get_async(model.cuda_stream, d_vector),
|
|
||||||
get_d_tokvecs
|
|
||||||
))
|
|
||||||
else:
|
|
||||||
model.backprops.append((token_ids, d_vector, get_d_tokvecs))
|
|
||||||
return None
|
return None
|
||||||
return scores, backprop_parser_step
|
return scores, backprop_parser_step
|
||||||
|
|
||||||
|
|
|
@ -200,6 +200,8 @@ cdef class Parser:
|
||||||
with nogil:
|
with nogil:
|
||||||
self._parseC(&states[0],
|
self._parseC(&states[0],
|
||||||
weights, sizes)
|
weights, sizes)
|
||||||
|
model.clear_memory()
|
||||||
|
del model
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
cdef void _parseC(self, StateC** states,
|
cdef void _parseC(self, StateC** states,
|
||||||
|
@ -312,6 +314,13 @@ cdef class Parser:
|
||||||
if set_annotations:
|
if set_annotations:
|
||||||
docs = [eg.predicted for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
self.set_annotations(docs, all_states)
|
self.set_annotations(docs, all_states)
|
||||||
|
# If we're working on GPU, we want to free the memory as soon as possible.
|
||||||
|
# Python doesn't seem to reliably release these objects in time unless we
|
||||||
|
# delete them explicitly, so drop the references by hand here.
|
||||||
|
del backprop
|
||||||
|
del backprop_tok2vec
|
||||||
|
model.clear_memory()
|
||||||
|
del model
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
||||||
|
@ -335,7 +344,7 @@ cdef class Parser:
|
||||||
set_dropout_rate(self._rehearsal_model, 0.0)
|
set_dropout_rate(self._rehearsal_model, 0.0)
|
||||||
set_dropout_rate(self.model, 0.0)
|
set_dropout_rate(self.model, 0.0)
|
||||||
tutor, _ = self._rehearsal_model.begin_update(docs)
|
tutor, _ = self._rehearsal_model.begin_update(docs)
|
||||||
model, finish_update = self.model.begin_update(docs)
|
model, backprop_tok2vec = self.model.begin_update(docs)
|
||||||
n_scores = 0.
|
n_scores = 0.
|
||||||
loss = 0.
|
loss = 0.
|
||||||
while states:
|
while states:
|
||||||
|
@ -351,10 +360,16 @@ cdef class Parser:
|
||||||
states = [state for state in states if not state.is_final()]
|
states = [state for state in states if not state.is_final()]
|
||||||
n_scores += d_scores.size
|
n_scores += d_scores.size
|
||||||
# Do the backprop
|
# Do the backprop
|
||||||
finish_update(docs)
|
backprop_tok2vec(docs)
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
self.model.finish_update(sgd)
|
self.model.finish_update(sgd)
|
||||||
losses[self.name] += loss / n_scores
|
losses[self.name] += loss / n_scores
|
||||||
|
del backprop
|
||||||
|
del backprop_tok2vec
|
||||||
|
model.clear_memory()
|
||||||
|
tutor.clear_memory()
|
||||||
|
del model
|
||||||
|
del tutor
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def get_gradients(self):
|
def get_gradients(self):
|
||||||
|
|
|
@ -488,7 +488,8 @@ data for machine learning models, developed by us. It integrates with spaCy
|
||||||
out-of-the-box and provides many different
|
out-of-the-box and provides many different
|
||||||
[annotation recipes](https://prodi.gy/docs/recipes) for a variety of NLP tasks,
|
[annotation recipes](https://prodi.gy/docs/recipes) for a variety of NLP tasks,
|
||||||
with and without a model in the loop. If Prodigy is installed in your project,
|
with and without a model in the loop. If Prodigy is installed in your project,
|
||||||
you can
|
you can start the annotation server from your `project.yml` for a tight feedback
|
||||||
|
loop between data development and training.
|
||||||
|
|
||||||
The following example command starts the Prodigy app using the
|
The following example command starts the Prodigy app using the
|
||||||
[`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in
|
[`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in
|
||||||
|
@ -497,6 +498,12 @@ then correct the suggestions manually in the UI. After you save and exit the
|
||||||
server, the full dataset is exported in spaCy's format and split into a training
|
server, the full dataset is exported in spaCy's format and split into a training
|
||||||
and evaluation set.
|
and evaluation set.
|
||||||
|
|
||||||
|
> #### Example usage
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ python -m spacy project run annotate
|
||||||
|
> ```
|
||||||
|
|
||||||
<!-- prettier-ignore -->
|
<!-- prettier-ignore -->
|
||||||
```yaml
|
```yaml
|
||||||
### project.yml
|
### project.yml
|
||||||
|
@ -509,7 +516,9 @@ commands:
|
||||||
- name: annotate
|
- name: annotate
|
||||||
- script:
|
- script:
|
||||||
- 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
|
- 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
|
||||||
- 'python -m prodigy data-to-spacy ./corpus/train.spacy ./corpus/eval.spacy --ner {PRODIGY_DATASET}'
|
- 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}'
|
||||||
|
- 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
|
||||||
|
- 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
|
||||||
- deps:
|
- deps:
|
||||||
- 'assets/raw_data.jsonl'
|
- 'assets/raw_data.jsonl'
|
||||||
- outputs:
|
- outputs:
|
||||||
|
@ -517,6 +526,15 @@ commands:
|
||||||
- 'corpus/eval.spacy'
|
- 'corpus/eval.spacy'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can use the same approach for other types of projects and annotation
|
||||||
|
workflows, including
|
||||||
|
[text classification](https://prodi.gy/docs/recipes#textcat),
|
||||||
|
[dependency parsing](https://prodi.gy/docs/recipes#dep),
|
||||||
|
[part-of-speech tagging](https://prodi.gy/docs/recipes#pos) or fully
|
||||||
|
[custom recipes](https://prodi.gy/docs/custom-recipes) – for instance, an A/B
|
||||||
|
evaluation workflow that lets you compare two different models and their
|
||||||
|
results.
|
||||||
|
|
||||||
<Project id="integrations/prodigy">
|
<Project id="integrations/prodigy">
|
||||||
|
|
||||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
|
||||||
|
@ -567,6 +585,12 @@ MODELS = [name.strip() for name in sys.argv[1].split(",")]
|
||||||
spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"])
|
spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
> #### Example usage
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ python -m spacy project run visualize
|
||||||
|
> ```
|
||||||
|
|
||||||
<!-- prettier-ignore -->
|
<!-- prettier-ignore -->
|
||||||
```yaml
|
```yaml
|
||||||
### project.yml
|
### project.yml
|
||||||
|
@ -591,7 +615,33 @@ mattis pretium.
|
||||||
|
|
||||||
### FastAPI {#fastapi} <IntegrationLogo name="fastapi" width={100} height="auto" align="right" />
|
### FastAPI {#fastapi} <IntegrationLogo name="fastapi" width={100} height="auto" align="right" />
|
||||||
|
|
||||||
<!-- TODO: come up with example – there's not much integration needed, but it'd be nice to show an example that addresses some of the main concerns for serving ML (workers etc.) -->
|
[FastAPI](https://fastapi.tiangolo.com/) is a modern high-performance framework
|
||||||
|
for building REST APIs with Python, based on Python
|
||||||
|
[type hints](https://fastapi.tiangolo.com/python-types/). It's become a popular
|
||||||
|
library for serving machine learning models.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# TODO: show an example that addresses some of the main concerns for serving ML (workers etc.)
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Example usage
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ python -m spacy project run serve
|
||||||
|
> ```
|
||||||
|
|
||||||
|
<!-- prettier-ignore -->
|
||||||
|
```yaml
|
||||||
|
### project.yml
|
||||||
|
commands:
|
||||||
|
- name: serve
|
||||||
|
help: "Serve the trained model with FastAPI"
|
||||||
|
script:
|
||||||
|
- 'python ./scripts/serve.py ./training/model-best'
|
||||||
|
deps:
|
||||||
|
- 'training/model-best'
|
||||||
|
no_skip: true
|
||||||
|
```
|
||||||
|
|
||||||
<Project id="integrations/fastapi">
|
<Project id="integrations/fastapi">
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user