Mirror of https://github.com/explosion/spaCy.git, synced 2025-07-14 18:22:27 +03:00

commit 576eeed849
Merge branch 'develop' into nightly.spacy.io
@@ -15,6 +15,7 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.5.0,<1.7.0
+jinja2
 # Official Python utilities
 setuptools
 packaging>=20.0
@@ -26,4 +27,3 @@ pytest>=4.6.5
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
-jinja2
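This pair of hunks (apparently spaCy's requirements.txt) moves jinja2 from the dev-only section into the runtime requirements; note that its version is left unpinned. A minimal sketch, not part of the diff, for confirming the dependency resolves in an environment:

```python
# Sanity-check that the new runtime dependency is importable and see
# which version was resolved ("jinja2" is the PyPI distribution name).
import importlib.metadata

import jinja2  # noqa: F401  (fails loudly if the requirement is missing)
print(importlib.metadata.version("jinja2"))
```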
@@ -5,6 +5,7 @@ from wasabi import Printer, diff_strings
 from thinc.api import Config
 import srsly
 import re
+from jinja2 import Template

 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
@@ -127,10 +128,6 @@ def init_config(
 ) -> None:
     is_stdout = str(output_file) == "-"
     msg = Printer(no_print=is_stdout)
-    try:
-        from jinja2 import Template
-    except ImportError:
-        msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
     # Filter out duplicates since tok2vec and transformer are added by template
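Because jinja2 is now a guaranteed runtime dependency, `init_config` can import `Template` at module level and drop the ImportError guard. The pattern in miniature — a sketch with an inline template string standing in for spaCy's real quickstart template file:

```python
# Minimal sketch of the now-unguarded usage; the template text and
# render variables here are illustrative, not spaCy's actual template.
from jinja2 import Template

template = Template("lang = {{ lang }}\npipeline = {{ components }}")
print(template.render(lang="en", components=["tok2vec", "ner"]))
```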
@@ -143,6 +143,9 @@ nO = null
 @architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0

+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
 [components.textcat.model.linear_model]
 @architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
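The quickstart template now inserts a pooling block so the TransformerListener's per-token vectors can be reduced to a single vector per doc for textcat. What `reduce_mean.v1` does, in a standalone sketch (assumes thinc is installed; the toy Ragged input is illustrative):

```python
# reduce_mean averages the rows belonging to each sequence in a Ragged
# array, yielding one vector per doc. Toy data: one doc, three "tokens".
import numpy
from thinc.api import Ragged, reduce_mean

pooling = reduce_mean()
data = Ragged(
    numpy.asarray([[1.0], [2.0], [3.0]], dtype="f"),  # token vectors
    numpy.asarray([3], dtype="i"),                    # tokens per doc
)
pooled, backprop = pooling(data, is_train=False)
print(pooled)  # [[2.]]
```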
@@ -712,6 +712,10 @@ class Errors:
     E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
              "token itself. To set the morph from this MorphAnalysis, set from "
              "the string value with: `token.set_morph(str(other_morph))`.")
+    E1014 = ("Error loading DocBin data. It doesn't look like the data is in "
+             "DocBin (.spacy) format. If your data is in spaCy v2's JSON "
+             "training format, convert it using `python -m spacy convert "
+             "file.json .`.")


     # Deprecated model shortcuts, only used in errors and warnings
@@ -61,14 +61,14 @@ def build_bow_text_classifier(


 @registry.architectures.register("spacy.TextCatEnsemble.v2")
-def build_text_classifier(
+def build_text_classifier_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     linear_model: Model[List[Doc], Floats2d],
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
     exclusive_classes = not linear_model.attrs["multi_label"]
     with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.get_dim("nO")
+        width = tok2vec.maybe_get_dim("nO")
         cnn_model = (
             tok2vec
             >> list2ragged()
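Two fixes in one hunk: the v2 ensemble gets a distinct function name (the module defined `build_text_classifier` twice, so the later definition shadowed the earlier one), and `get_dim` becomes `maybe_get_dim`, which returns `None` for an unset dimension instead of raising. The difference in a minimal sketch:

```python
# get_dim raises on an unset dimension; maybe_get_dim returns None,
# letting callers defer width inference until the model is initialized.
from thinc.api import Linear

model = Linear()                  # nO/nI deliberately left unset
print(model.maybe_get_dim("nO"))  # None
try:
    model.get_dim("nO")
except ValueError as err:
    print("get_dim raised:", err)
```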
@@ -94,7 +94,7 @@ def build_text_classifier(

 # TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
-def build_text_classifier(
+def build_text_classifier_v1(
     width: int,
     embed_size: int,
     pretrained_vectors: Optional[bool],
@@ -92,9 +92,6 @@ class Morphologizer(Tagger):
         # 2) labels_pos stores a mapping from morph+POS->POS
         cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
         self.cfg = dict(sorted(cfg.items()))
-        # add mappings for empty morph
-        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
-        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]

    @property
    def labels(self):
@@ -201,8 +198,8 @@ class Morphologizer(Tagger):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
-                doc.c[j].pos = self.cfg["labels_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
+                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)

    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
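Together with the removed EMPTY_MORPH bootstrap above, annotation lookups now fall back to `0` (the id spaCy reserves for the empty value) via `dict.get` instead of raising `KeyError` on a morph that never made it into the mapping. The mechanics in a toy sketch (values illustrative):

```python
# dict.get with a default in miniature: unknown labels map to 0 (the
# "empty" id) instead of raising KeyError.
labels_morph = {"Degree=Pos": "Degree=Pos"}

for morph in ("Degree=Pos", "Case=Nom"):   # second key is unseen
    print(labels_morph.get(morph, 0))      # "Degree=Pos", then 0
```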
@@ -228,12 +225,12 @@ class Morphologizer(Tagger):
                # doesn't, so if either is None, treat both as None here so that
                # truths doesn't end up with an unknown morph+POS combination
                if pos is None or morph is None:
-                    pos = None
-                    morph = None
-                label_dict = Morphology.feats_to_dict(morph)
-                if pos:
-                    label_dict[self.POS_FEAT] = pos
-                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    label = None
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[self.POS_FEAT] = pos
+                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
                eg_truths.append(label)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
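The fix routes fully-annotated tokens through the new else branch and leaves `label = None` for tokens missing either POS or morph, so the gold standard never contains a half-built morph+POS combination. The control flow in a spaCy-free sketch (`make_label` is a hypothetical stand-in for the real code path):

```python
# Hypothetical stand-in for the fixed branch in get_loss: missing
# annotations yield None (ignored by the loss) rather than a label
# built from a partly-None pair.
def make_label(pos, morph):
    if pos is None or morph is None:
        return None
    feats = dict(f.split("=", 1) for f in morph.split("|")) if morph else {}
    if pos:
        feats["POS"] = pos
    return "|".join(f"{k}={v}" for k, v in sorted(feats.items()))

print(make_label(None, "Degree=Pos"))    # None
print(make_label("ADJ", "Degree=Pos"))   # Degree=Pos|POS=ADJ
```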
@@ -116,3 +116,23 @@ def test_overfitting_IO():
     no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)
+
+    # Test without POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            token.pos_ = ""
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["Feat=N", "Feat=V", "", ""]
+    gold_pos_tags = ["", "", "", ""]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
@@ -198,7 +198,10 @@ class DocBin:

        DOCS: https://nightly.spacy.io/api/docbin#from_bytes
        """
-        msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
+        try:
+            msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
+        except zlib.error:
+            raise ValueError(Errors.E1014)
        self.attrs = msg["attrs"]
        self.strings = set(msg["strings"])
        lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
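With the new guard, corrupt or non-`.spacy` input surfaces as a `ValueError` carrying the new E1014 message rather than a raw `zlib.error`. What that looks like from the caller's side — a minimal sketch assuming spaCy v3 (nightly) is installed:

```python
# Feeding arbitrary bytes to DocBin.from_bytes now raises ValueError
# with the E1014 message instead of an opaque zlib.error.
from spacy.tokens import DocBin

try:
    DocBin().from_bytes(b"definitely not DocBin data")
except ValueError as err:
    print(err)  # [E1014] Error loading DocBin data...
```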
@@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters.
 > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 > ```

 | Setting | Description |
-| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
-| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |

 ```python
 https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
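The docs table previously named a nonexistent attribute; the extension is registered as `Doc._.trf_data`. Accessing it, as a sketch (assumes spacy-transformers and a transformer pipeline such as `en_core_web_trf` are installed):

```python
# The corrected attribute name in use. Requires spacy-transformers and
# an installed transformer pipeline; "en_core_web_trf" is one example.
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("The attribute is doc._.trf_data, not doc._.transformer_data.")
print(type(doc._.trf_data).__name__)  # TransformerData
```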
@@ -120,7 +120,7 @@ function formatAccuracy(data) {
             ? null
             : {
                   label,
-                  value: (value * 100).toFixed(2),
+                  value: value.toFixed(2),
                   help: MODEL_META[label],
               }
     })