Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-11-27 15:17:33 +08:00
commit 576eeed849
10 changed files with 50 additions and 26 deletions

View File

@@ -15,6 +15,7 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.5.0,<1.7.0
+jinja2
 # Official Python utilities
 setuptools
 packaging>=20.0
@@ -26,4 +27,3 @@ pytest>=4.6.5
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
-jinja2

View File

@@ -5,6 +5,7 @@ from wasabi import Printer, diff_strings
 from thinc.api import Config
 import srsly
 import re
+from jinja2 import Template

 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
@@ -127,10 +128,6 @@ def init_config(
 ) -> None:
     is_stdout = str(output_file) == "-"
     msg = Printer(no_print=is_stdout)
-    try:
-        from jinja2 import Template
-    except ImportError:
-        msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
     # Filter out duplicates since tok2vec and transformer are added by template
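Note: with `jinja2` promoted from a development-only requirement to a core install dependency (see the requirements.txt change above), the guarded import and its `msg.fail` fallback are no longer needed. A minimal sketch of the rendering pattern, with a hypothetical template path standing in for spaCy's `TEMPLATE_PATH` and illustrative template variables:

```python
from pathlib import Path
from jinja2 import Template  # now a guaranteed core dependency

# Hypothetical path standing in for TEMPLATE_PATH
template_path = Path("quickstart_training.jinja")
with template_path.open("r") as f:
    template = Template(f.read())
# Variable names here are illustrative, not the template's actual inputs
rendered = template.render(lang="en", components=["textcat"])
```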

View File

@@ -143,6 +143,9 @@ nO = null
 @architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0

+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
 [components.textcat.model.linear_model]
 @architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
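The listener hands downstream components a group of wordpiece vectors per token, so the textcat's tok2vec needs an explicit pooling layer to reduce each group to a single vector. In Thinc terms, the registered `reduce_mean.v1` resolves to the `reduce_mean` layer:

```python
from thinc.api import reduce_mean

# "reduce_mean.v1" in the config resolves to this layer, which averages
# the wordpiece rows belonging to each token into one vector
pooling = reduce_mean()
```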

View File

@@ -712,6 +712,10 @@ class Errors:
     E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
              "token itself. To set the morph from this MorphAnalysis, set from "
              "the string value with: `token.set_morph(str(other_morph))`.")
+    E1014 = ("Error loading DocBin data. It doesn't look like the data is in "
+             "DocBin (.spacy) format. If your data is in spaCy v2's JSON "
+             "training format, convert it using `python -m spacy convert "
+             "file.json .`.")

 # Deprecated model shortcuts, only used in errors and warnings

View File

@@ -61,14 +61,14 @@ def build_bow_text_classifier(

 @registry.architectures.register("spacy.TextCatEnsemble.v2")
-def build_text_classifier(
+def build_text_classifier_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     linear_model: Model[List[Doc], Floats2d],
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
     exclusive_classes = not linear_model.attrs["multi_label"]
     with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.get_dim("nO")
+        width = tok2vec.maybe_get_dim("nO")
         cnn_model = (
             tok2vec
             >> list2ragged()
@@ -94,7 +94,7 @@ def build_text_classifier(
 # TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
-def build_text_classifier(
+def build_text_classifier_v1(
     width: int,
     embed_size: int,
     pretrained_vectors: Optional[bool],
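The renames give each registered architecture its own Python name instead of two identically named `build_text_classifier` functions in the same module, where the later definition shadowed the earlier one. The switch to `maybe_get_dim` also looks deliberate: it returns `None` when the dimension is unset instead of raising, so the ensemble can be assembled before the tok2vec's width is known. A quick illustration with a bare Thinc layer:

```python
from thinc.api import Linear

layer = Linear()  # nO/nI left unset until initialization
# get_dim("nO") would raise here; maybe_get_dim returns None instead
assert layer.maybe_get_dim("nO") is None
```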

View File

@@ -92,9 +92,6 @@ class Morphologizer(Tagger):
         # 2) labels_pos stores a mapping from morph+POS->POS
         cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
         self.cfg = dict(sorted(cfg.items()))
-        # add mappings for empty morph
-        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
-        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]

     @property
     def labels(self):
@@ -201,8 +198,8 @@ class Morphologizer(Tagger):
                 doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
-                doc.c[j].pos = self.cfg["labels_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
+                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)

     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
@@ -228,12 +225,12 @@ class Morphologizer(Tagger):
                 # doesn't, so if either is None, treat both as None here so that
                 # truths doesn't end up with an unknown morph+POS combination
                 if pos is None or morph is None:
-                    pos = None
-                    morph = None
-                label_dict = Morphology.feats_to_dict(morph)
-                if pos:
-                    label_dict[self.POS_FEAT] = pos
-                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    label = None
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[self.POS_FEAT] = pos
+                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
                 eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
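The `get_loss` restructure is more than cosmetic: the old code nulled `pos` and `morph` but still fell through to build a label from them, so a partially annotated token contributed an empty-morph "truth"; the new `else` branch skips label construction entirely, and a `None` truth is treated as a missing gold annotation. A reduced sketch of the corrected control flow, with simplified stand-in labels rather than spaCy's real `Morphology` machinery:

```python
from typing import Optional

def make_truth(pos: Optional[str], morph: Optional[str]) -> Optional[str]:
    # If either annotation is missing, treat the gold label as missing
    if pos is None or morph is None:
        return None
    # Otherwise combine features and POS into a single truth label
    feats = {"POS": pos, "feats": morph}
    return "|".join(f"{k}={v}" for k, v in sorted(feats.items()))

assert make_truth(None, "Case=Nom") is None
assert make_truth("NOUN", "Case=Nom") == "POS=NOUN|feats=Case=Nom"
```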

View File

@@ -116,3 +116,23 @@ def test_overfitting_IO():
     no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)
+
+    # Test without POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            token.pos_ = ""
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["Feat=N", "Feat=V", "", ""]
+    gold_pos_tags = ["", "", "", ""]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags

View File

@@ -198,7 +198,10 @@ class DocBin:

         DOCS: https://nightly.spacy.io/api/docbin#from_bytes
         """
-        msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
+        try:
+            msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
+        except zlib.error:
+            raise ValueError(Errors.E1014)
         self.attrs = msg["attrs"]
         self.strings = set(msg["strings"])
         lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
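Together with the new E1014 above, this turns a cryptic `zlib.error` into an actionable message when someone passes non-`.spacy` data to `DocBin`. A quick illustration of the new behavior, assuming this change is applied (`pytest` is used only for the assertion):

```python
import pytest
from spacy.tokens import DocBin

# v2-style JSON bytes are not zlib-compressed msgpack, so from_bytes
# now raises ValueError with the E1014 message instead of zlib.error
with pytest.raises(ValueError):
    DocBin().from_bytes(b'[{"id": 0, "paragraphs": []}]')
```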

View File

@@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters.
 > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 > ```

 | Setting                 | Description |
 | ----------------------- | ----------- |
 | `max_batch_items`       | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
-| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
 | `model`                 | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |

 ```python
 https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
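The doc fix matters for anyone writing a custom callback: the per-doc transformer output lives on `Doc._.trf_data`, not `Doc._.transformer_data`, and it is already populated when the callback runs. A minimal sketch, assuming `spacy-transformers` is installed and using a hypothetical custom extension:

```python
from typing import List
from spacy.tokens import Doc
from spacy_transformers.data_classes import FullTransformerBatch

Doc.set_extension("trf_tensor_count", default=0)  # hypothetical attribute

def set_custom_annotations(docs: List[Doc], trf_data: FullTransformerBatch) -> None:
    # Doc._.trf_data is set before this callback is invoked
    for doc in docs:
        doc._.trf_tensor_count = len(doc._.trf_data.tensors)
```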

View File

@@ -120,7 +120,7 @@ function formatAccuracy(data) {
             ? null
             : {
                   label,
-                  value: (value * 100).toFixed(2),
+                  value: value.toFixed(2),
                   help: MODEL_META[label],
               }
     })
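(Presumably the accuracy figures in the models' meta are now stored as percentages already, so the extra `* 100` was double-scaling the displayed values; `value.toFixed(2)` formats them as-is.)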