Merge branch 'develop' into nightly.spacy.io

This commit is contained in:
Ines Montani 2020-11-27 15:17:33 +08:00
commit 576eeed849
10 changed files with 50 additions and 26 deletions

View File

@ -15,6 +15,7 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.5.0,<1.7.0
jinja2
# Official Python utilities
setuptools
packaging>=20.0
@ -26,4 +27,3 @@ pytest>=4.6.5
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
jinja2

View File

@ -5,6 +5,7 @@ from wasabi import Printer, diff_strings
from thinc.api import Config
import srsly
import re
from jinja2 import Template
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
@ -127,10 +128,6 @@ def init_config(
) -> None:
is_stdout = str(output_file) == "-"
msg = Printer(no_print=is_stdout)
try:
from jinja2 import Template
except ImportError:
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
with TEMPLATE_PATH.open("r") as f:
template = Template(f.read())
# Filter out duplicates since tok2vec and transformer are added by template

View File

@ -143,6 +143,9 @@ nO = null
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false

View File

@ -712,6 +712,10 @@ class Errors:
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
"token itself. To set the morph from this MorphAnalysis, set from "
"the string value with: `token.set_morph(str(other_morph))`.")
E1014 = ("Error loading DocBin data. It doesn't look like the data is in "
"DocBin (.spacy) format. If your data is in spaCy v2's JSON "
"training format, convert it using `python -m spacy convert "
"file.json .`.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -61,14 +61,14 @@ def build_bow_text_classifier(
@registry.architectures.register("spacy.TextCatEnsemble.v2")
def build_text_classifier(
def build_text_classifier_v2(
tok2vec: Model[List[Doc], List[Floats2d]],
linear_model: Model[List[Doc], Floats2d],
nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
exclusive_classes = not linear_model.attrs["multi_label"]
with Model.define_operators({">>": chain, "|": concatenate}):
width = tok2vec.get_dim("nO")
width = tok2vec.maybe_get_dim("nO")
cnn_model = (
tok2vec
>> list2ragged()
@ -94,7 +94,7 @@ def build_text_classifier(
# TODO: move to legacy
@registry.architectures.register("spacy.TextCatEnsemble.v1")
def build_text_classifier(
def build_text_classifier_v1(
width: int,
embed_size: int,
pretrained_vectors: Optional[bool],

View File

@ -92,9 +92,6 @@ class Morphologizer(Tagger):
# 2) labels_pos stores a mapping from morph+POS->POS
cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
self.cfg = dict(sorted(cfg.items()))
# add mappings for empty morph
self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
@property
def labels(self):
@ -201,8 +198,8 @@ class Morphologizer(Tagger):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
morph = self.labels[tag_id]
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
doc.c[j].pos = self.cfg["labels_pos"][morph]
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
@ -228,12 +225,12 @@ class Morphologizer(Tagger):
# doesn't, so if either is None, treat both as None here so that
# truths doesn't end up with an unknown morph+POS combination
if pos is None or morph is None:
pos = None
morph = None
label_dict = Morphology.feats_to_dict(morph)
if pos:
label_dict[self.POS_FEAT] = pos
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
label = None
else:
label_dict = Morphology.feats_to_dict(morph)
if pos:
label_dict[self.POS_FEAT] = pos
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
eg_truths.append(label)
truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths)

View File

@ -116,3 +116,23 @@ def test_overfitting_IO():
no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
# Test without POS
nlp.remove_pipe("morphologizer")
nlp.add_pipe("morphologizer")
for example in train_examples:
for token in example.reference:
token.pos_ = ""
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["morphologizer"] < 0.00001
# Test the trained model
test_text = "I like blue ham"
doc = nlp(test_text)
gold_morphs = ["Feat=N", "Feat=V", "", ""]
gold_pos_tags = ["", "", "", ""]
assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags

View File

@ -198,7 +198,10 @@ class DocBin:
DOCS: https://nightly.spacy.io/api/docbin#from_bytes
"""
msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
try:
msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
except zlib.error:
raise ValueError(Errors.E1014)
self.attrs = msg["attrs"]
self.strings = set(msg["strings"])
lengths = numpy.frombuffer(msg["lengths"], dtype="int32")

View File

@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters.
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
> ```
| Setting | Description |
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
| Setting | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py

View File

@ -120,7 +120,7 @@ function formatAccuracy(data) {
? null
: {
label,
value: (value * 100).toFixed(2),
value: value.toFixed(2),
help: MODEL_META[label],
}
})