mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge remote-tracking branch 'upstream/develop' into feature/trf-docs
This commit is contained in:
commit
636be3c791
|
@ -86,6 +86,8 @@ cuda101 =
|
|||
cupy-cuda101>=5.0.0b4,<9.0.0
|
||||
cuda102 =
|
||||
cupy-cuda102>=5.0.0b4,<9.0.0
|
||||
cuda110 =
|
||||
cupy-cuda110>=5.0.0b4,<9.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
ja =
|
||||
sudachipy>=0.4.9
|
||||
|
@ -94,8 +96,6 @@ ko =
|
|||
natto-py==0.9.0
|
||||
th =
|
||||
pythainlp>=2.0
|
||||
zh =
|
||||
spacy-pkuseg==0.0.26
|
||||
|
||||
[bdist_wheel]
|
||||
universal = false
|
||||
|
|
|
@ -143,6 +143,9 @@ nO = null
|
|||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
|
|
|
@ -17,7 +17,7 @@ from ... import util
|
|||
|
||||
|
||||
# fmt: off
|
||||
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
|
||||
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install \"spacy-pkuseg>=0.0.27,<0.1.0\"` or `conda install -c conda-forge \"spacy-pkuseg>=0.0.27,<0.1.0\"`"
|
||||
# fmt: on
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
|
|
@ -61,14 +61,14 @@ def build_bow_text_classifier(
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.TextCatEnsemble.v2")
|
||||
def build_text_classifier(
|
||||
def build_text_classifier_v2(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
linear_model: Model[List[Doc], Floats2d],
|
||||
nO: Optional[int] = None,
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
exclusive_classes = not linear_model.attrs["multi_label"]
|
||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||
width = tok2vec.get_dim("nO")
|
||||
width = tok2vec.maybe_get_dim("nO")
|
||||
cnn_model = (
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
|
|
|
@ -92,9 +92,6 @@ class Morphologizer(Tagger):
|
|||
# 2) labels_pos stores a mapping from morph+POS->POS
|
||||
cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
# add mappings for empty morph
|
||||
self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
|
||||
self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -201,8 +198,8 @@ class Morphologizer(Tagger):
|
|||
doc_tag_ids = doc_tag_ids.get()
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
morph = self.labels[tag_id]
|
||||
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
|
||||
doc.c[j].pos = self.cfg["labels_pos"][morph]
|
||||
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
|
||||
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
"""Find the loss and gradient of loss for the batch of documents and
|
||||
|
@ -228,12 +225,12 @@ class Morphologizer(Tagger):
|
|||
# doesn't, so if either is None, treat both as None here so that
|
||||
# truths doesn't end up with an unknown morph+POS combination
|
||||
if pos is None or morph is None:
|
||||
pos = None
|
||||
morph = None
|
||||
label_dict = Morphology.feats_to_dict(morph)
|
||||
if pos:
|
||||
label_dict[self.POS_FEAT] = pos
|
||||
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
|
||||
label = None
|
||||
else:
|
||||
label_dict = Morphology.feats_to_dict(morph)
|
||||
if pos:
|
||||
label_dict[self.POS_FEAT] = pos
|
||||
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
|
||||
eg_truths.append(label)
|
||||
truths.append(eg_truths)
|
||||
d_scores, loss = loss_func(scores, truths)
|
||||
|
|
|
@ -512,7 +512,7 @@ class Scorer:
|
|||
negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
|
||||
RETURNS (Dict[str, Any]): A dictionary containing the scores.
|
||||
|
||||
DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
|
||||
DOCS: https://nightly.spacy.io/api/scorer#score_links
|
||||
"""
|
||||
f_per_type = {}
|
||||
for example in examples:
|
||||
|
|
|
@ -116,3 +116,23 @@ def test_overfitting_IO():
|
|||
no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
|
||||
assert_equal(batch_deps_1, batch_deps_2)
|
||||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
||||
# Test without POS
|
||||
nlp.remove_pipe("morphologizer")
|
||||
nlp.add_pipe("morphologizer")
|
||||
for example in train_examples:
|
||||
for token in example.reference:
|
||||
token.pos_ = ""
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["morphologizer"] < 0.00001
|
||||
|
||||
# Test the trained model
|
||||
test_text = "I like blue ham"
|
||||
doc = nlp(test_text)
|
||||
gold_morphs = ["Feat=N", "Feat=V", "", ""]
|
||||
gold_pos_tags = ["", "", "", ""]
|
||||
assert [str(t.morph) for t in doc] == gold_morphs
|
||||
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||
|
|
|
@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
|
||||
| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||
| Setting | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
|
||||
| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
|
||||
|
|
|
@ -174,6 +174,7 @@ $ source .env/bin/activate # activate virtual env
|
|||
$ export PYTHONPATH=`pwd` # set Python path to spaCy dir
|
||||
$ pip install -r requirements.txt # install all requirements
|
||||
$ python setup.py build_ext --inplace # compile spaCy
|
||||
$ python setup.py install # install spaCy
|
||||
```
|
||||
|
||||
Compared to regular install via pip, the
|
||||
|
|
|
@ -843,6 +843,27 @@ def __call__(self, Doc doc):
|
|||
return doc
|
||||
```
|
||||
|
||||
There is one more optional method to implement: [`score`](/api/pipe#score)
|
||||
calculates the performance of your component on a set of examples, and
|
||||
returns the results as a dictionary:
|
||||
|
||||
```python
|
||||
### The score method
|
||||
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
|
||||
prf = PRFScore()
|
||||
for example in examples:
|
||||
...
|
||||
|
||||
return {
|
||||
"rel_micro_p": prf.precision,
|
||||
"rel_micro_r": prf.recall,
|
||||
"rel_micro_f": prf.fscore,
|
||||
}
|
||||
```
|
||||
|
||||
This is particularly useful to see the scores on the development corpus
|
||||
when training the component with [`spacy train`](/api/cli#training).
|
||||
|
||||
Once our `TrainablePipe` subclass is fully implemented, we can
|
||||
[register](/usage/processing-pipelines#custom-components-factories) the
|
||||
component with the [`@Language.factory`](/api/language#factory) decorator. This
|
||||
|
@ -865,6 +886,11 @@ assigns it a name and lets you create the component with
|
|||
> [components.relation_extractor.model.get_candidates]
|
||||
> @misc = "rel_cand_generator.v1"
|
||||
> max_length = 20
|
||||
>
|
||||
> [training.score_weights]
|
||||
> rel_micro_p = 0.0
|
||||
> rel_micro_r = 0.0
|
||||
> rel_micro_f = 1.0
|
||||
> ```
|
||||
|
||||
```python
|
||||
|
@ -876,6 +902,28 @@ def make_relation_extractor(nlp, name, model):
|
|||
return RelationExtractor(nlp.vocab, model, name)
|
||||
```
|
||||
|
||||
You can extend the decorator to include information such as the type of
|
||||
annotations that are required for this component to run, the type of annotations
|
||||
it produces, and the scores that can be calculated:
|
||||
|
||||
```python
|
||||
### Factory annotations {highlight="5-11"}
|
||||
from spacy.language import Language
|
||||
|
||||
@Language.factory(
|
||||
"relation_extractor",
|
||||
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||
assigns=["doc._.rel"],
|
||||
default_score_weights={
|
||||
"rel_micro_p": None,
|
||||
"rel_micro_r": None,
|
||||
"rel_micro_f": None,
|
||||
},
|
||||
)
|
||||
def make_relation_extractor(nlp, name, model):
|
||||
return RelationExtractor(nlp.vocab, model, name)
|
||||
```
|
||||
|
||||
<!-- TODO: <Project id="tutorials/ner-relations">
|
||||
|
||||
</Project> -->
|
||||
|
|
|
@ -969,7 +969,7 @@ import spacy
|
|||
from spacy.tokens import Doc, DocBin
|
||||
|
||||
nlp = spacy.blank("en")
|
||||
docbin = DocBin(nlp.vocab)
|
||||
docbin = DocBin()
|
||||
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
|
||||
spaces = [True, True, True, True, True, True, True, False]
|
||||
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
|
||||
|
|
|
@ -7,7 +7,7 @@ import { repo } from '../components/util'
|
|||
const DEFAULT_MODELS = ['en']
|
||||
const DEFAULT_OPT = 'efficiency'
|
||||
const DEFAULT_HARDWARE = 'cpu'
|
||||
const DEFAULT_CUDA = 'cuda100'
|
||||
const DEFAULT_CUDA = 'cuda102'
|
||||
const CUDA = {
|
||||
'8.0': 'cuda80',
|
||||
'9.0': 'cuda90',
|
||||
|
@ -16,56 +16,9 @@ const CUDA = {
|
|||
'10.0': 'cuda100',
|
||||
'10.1': 'cuda101',
|
||||
'10.2': 'cuda102',
|
||||
'11.0': 'cuda110',
|
||||
}
|
||||
const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models
|
||||
const DATA = [
|
||||
{
|
||||
id: 'os',
|
||||
title: 'Operating system',
|
||||
options: [
|
||||
{ id: 'mac', title: 'macOS / OSX', checked: true },
|
||||
{ id: 'windows', title: 'Windows' },
|
||||
{ id: 'linux', title: 'Linux' },
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'package',
|
||||
title: 'Package manager',
|
||||
options: [
|
||||
{ id: 'pip', title: 'pip', checked: true },
|
||||
{ id: 'conda', title: 'conda' },
|
||||
{ id: 'source', title: 'from source' },
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'hardware',
|
||||
title: 'Hardware',
|
||||
options: [
|
||||
{ id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
|
||||
{ id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE == 'gpu' },
|
||||
],
|
||||
dropdown: Object.keys(CUDA).map(id => ({ id: CUDA[id], title: `CUDA ${id}` })),
|
||||
defaultValue: DEFAULT_CUDA,
|
||||
},
|
||||
{
|
||||
id: 'config',
|
||||
title: 'Configuration',
|
||||
multiple: true,
|
||||
options: [
|
||||
{
|
||||
id: 'venv',
|
||||
title: 'virtual env',
|
||||
help: 'Use a virtual environment and install spaCy into a user directory',
|
||||
},
|
||||
{
|
||||
id: 'train',
|
||||
title: 'train models',
|
||||
help:
|
||||
'Check this if you plan to train your own models with spaCy to install extra dependencies and data resources',
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
const LANG_EXTRAS = ['ja'] // only for languages with models
|
||||
|
||||
const QuickstartInstall = ({ id, title }) => {
|
||||
const [train, setTrain] = useState(false)
|
||||
|
@ -99,7 +52,56 @@ const QuickstartInstall = ({ id, title }) => {
|
|||
const pkg = nightly ? 'spacy-nightly' : 'spacy'
|
||||
const models = languages.filter(({ models }) => models !== null)
|
||||
const data = [
|
||||
...DATA,
|
||||
{
|
||||
id: 'os',
|
||||
title: 'Operating system',
|
||||
options: [
|
||||
{ id: 'mac', title: 'macOS / OSX', checked: true },
|
||||
{ id: 'windows', title: 'Windows' },
|
||||
{ id: 'linux', title: 'Linux' },
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'package',
|
||||
title: 'Package manager',
|
||||
options: [
|
||||
{ id: 'pip', title: 'pip', checked: true },
|
||||
!nightly ? { id: 'conda', title: 'conda' } : null,
|
||||
{ id: 'source', title: 'from source' },
|
||||
].filter(o => o),
|
||||
},
|
||||
{
|
||||
id: 'hardware',
|
||||
title: 'Hardware',
|
||||
options: [
|
||||
{ id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
|
||||
{ id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE == 'gpu' },
|
||||
],
|
||||
dropdown: Object.keys(CUDA).map(id => ({
|
||||
id: CUDA[id],
|
||||
title: `CUDA ${id}`,
|
||||
})),
|
||||
defaultValue: DEFAULT_CUDA,
|
||||
},
|
||||
{
|
||||
id: 'config',
|
||||
title: 'Configuration',
|
||||
multiple: true,
|
||||
options: [
|
||||
{
|
||||
id: 'venv',
|
||||
title: 'virtual env',
|
||||
help:
|
||||
'Use a virtual environment and install spaCy into a user directory',
|
||||
},
|
||||
{
|
||||
id: 'train',
|
||||
title: 'train models',
|
||||
help:
|
||||
'Check this if you plan to train your own models with spaCy to install extra dependencies and data resources',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'models',
|
||||
title: 'Trained pipelines',
|
||||
|
@ -141,11 +143,6 @@ const QuickstartInstall = ({ id, title }) => {
|
|||
setters={setters}
|
||||
showDropdown={showDropdown}
|
||||
>
|
||||
{nightly && (
|
||||
<QS package="conda" comment prompt={false}>
|
||||
# 🚨 Nightly releases are currently only available via pip
|
||||
</QS>
|
||||
)}
|
||||
<QS config="venv">python -m venv .env</QS>
|
||||
<QS config="venv" os="mac">
|
||||
source .env/bin/activate
|
||||
|
@ -180,15 +177,17 @@ const QuickstartInstall = ({ id, title }) => {
|
|||
</QS>
|
||||
<QS package="source">pip install -r requirements.txt</QS>
|
||||
<QS package="source">python setup.py build_ext --inplace</QS>
|
||||
{(train || hardware == 'gpu') && (
|
||||
<QS package="source">pip install -e '.[{pipExtras}]'</QS>
|
||||
)}
|
||||
|
||||
<QS config="train" package="conda">
|
||||
conda install -c conda-forge spacy-transformers
|
||||
<QS package="source">
|
||||
pip install {train || hardware == 'gpu' ? `'.[${pipExtras}]'` : '.'}
|
||||
</QS>
|
||||
<QS config="train" package="conda" comment prompt={false}>
|
||||
# packages only available via pip
|
||||
</QS>
|
||||
<QS config="train" package="conda">
|
||||
conda install -c conda-forge spacy-lookups-data
|
||||
pip install spacy-transformers
|
||||
</QS>
|
||||
<QS config="train" package="conda">
|
||||
pip install spacy-lookups-data
|
||||
</QS>
|
||||
|
||||
{models.map(({ code, models: modelOptions }) => {
|
||||
|
|
Loading…
Reference in New Issue
Block a user