mirror of https://github.com/explosion/spaCy.git
synced 2025-04-16 15:11:59 +03:00

Merge branch 'develop' into nightly.spacy.io

This commit is contained in: commit 576eeed849

requirements.txt
@@ -15,6 +15,7 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.5.0,<1.7.0
+jinja2
 # Official Python utilities
 setuptools
 packaging>=20.0

@@ -26,4 +27,3 @@ pytest>=4.6.5
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
-jinja2

spacy/cli/init_config.py
@@ -5,6 +5,7 @@ from wasabi import Printer, diff_strings
 from thinc.api import Config
 import srsly
 import re
+from jinja2 import Template

 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH

@@ -127,10 +128,6 @@ def init_config(
 ) -> None:
     is_stdout = str(output_file) == "-"
     msg = Printer(no_print=is_stdout)
-    try:
-        from jinja2 import Template
-    except ImportError:
-        msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
     # Filter out duplicates since tok2vec and transformer are added by template
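
With jinja2 now a core requirement (see the requirements.txt change above), the import guard inside `init_config` is redundant and the import moves to module level. A minimal sketch of the rendering step that remains, assuming a local copy of the quickstart template; the path and the `use_transformer` variable are illustrative assumptions, not taken from this diff:

```python
from pathlib import Path
from jinja2 import Template

# Hypothetical local copy of the bundled quickstart template.
template_path = Path("quickstart_training.jinja")
with template_path.open("r", encoding="utf8") as f:
    template = Template(f.read())
# Render the config text from template variables (names assumed here).
config_text = template.render(use_transformer=True)
print(config_text[:200])
```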

quickstart training config template (jinja)
@@ -143,6 +143,9 @@ nO = null
 @architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0

 [components.textcat.model.tok2vec.pooling]
 @layers = "reduce_mean.v1"

+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
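
The added section wires a bag-of-words linear model into the transformer-based textcat ensemble. A hedged sketch of parsing such a fragment with Thinc's `Config` (the fragment is the added section verbatim; resolving it into a real model would additionally need the architecture's remaining parameters):

```python
from thinc.api import Config

fragment = """
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
"""
config = Config().from_str(fragment)
# Dotted section names become nested dicts.
print(config["components"]["textcat"]["model"]["linear_model"]["@architectures"])
```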

spacy/errors.py
@@ -712,6 +712,10 @@ class Errors:
     E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
              "token itself. To set the morph from this MorphAnalysis, set from "
              "the string value with: `token.set_morph(str(other_morph))`.")
+    E1014 = ("Error loading DocBin data. It doesn't look like the data is in "
+             "DocBin (.spacy) format. If your data is in spaCy v2's JSON "
+             "training format, convert it using `python -m spacy convert "
+             "file.json .`.")


 # Deprecated model shortcuts, only used in errors and warnings

spacy/ml/models/textcat.py
@@ -61,14 +61,14 @@ def build_bow_text_classifier(


 @registry.architectures.register("spacy.TextCatEnsemble.v2")
-def build_text_classifier(
+def build_text_classifier_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     linear_model: Model[List[Doc], Floats2d],
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
     exclusive_classes = not linear_model.attrs["multi_label"]
     with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.get_dim("nO")
+        width = tok2vec.maybe_get_dim("nO")
         cnn_model = (
             tok2vec
             >> list2ragged()
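
Two fixes here: the v1 and v2 ensemble builders no longer share a Python name, and the tok2vec width is queried safely, since it may not be set until the model is initialized. A quick sketch of the `get_dim` vs. `maybe_get_dim` distinction in Thinc v8, using a bare `Linear` layer for illustration:

```python
from thinc.api import Linear

layer = Linear()  # nO/nI stay unset until the layer is initialized
print(layer.maybe_get_dim("nO"))  # None: safe to query an unset dimension
# layer.get_dim("nO") would raise here, because "nO" is not set yet
```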

@@ -94,7 +94,7 @@ def build_text_classifier(

 # TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
-def build_text_classifier(
+def build_text_classifier_v1(
     width: int,
     embed_size: int,
     pretrained_vectors: Optional[bool],
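
Before the rename, both builders were defined as `build_text_classifier` in the same module, so the second `def` shadowed the first at module level even though each registry entry kept its own function object. A minimal sketch of that shadowing behaviour, with a plain dict standing in for spaCy's registry:

```python
registry = {}

def build():  # stands in for the v2 builder
    return "v2"
registry["spacy.TextCatEnsemble.v2"] = build

def build():  # the later definition shadows the earlier module-level name
    return "v1"
registry["spacy.TextCatEnsemble.v1"] = build

print(registry["spacy.TextCatEnsemble.v2"]())  # "v2": the registry kept the object
print(build())  # "v1": the name `build` now refers only to the v1 builder
```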

spacy/pipeline/morphologizer.pyx
@@ -92,9 +92,6 @@ class Morphologizer(Tagger):
         # 2) labels_pos stores a mapping from morph+POS->POS
         cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
         self.cfg = dict(sorted(cfg.items()))
-        # add mappings for empty morph
-        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
-        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]

     @property
     def labels(self):

@@ -201,8 +198,8 @@ class Morphologizer(Tagger):
                 doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
-                doc.c[j].pos = self.cfg["labels_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
+                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)

     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
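
The direct lookups raised a `KeyError` for any predicted label without a stored morph/POS mapping, such as the empty morph whose explicit mappings were removed above. A tiny sketch of the fallback the `.get(..., 0)` calls provide (plain dicts, as in `self.cfg`):

```python
labels_morph = {"Case=Nom": "Case=Nom"}  # stands in for self.cfg["labels_morph"]
print(labels_morph.get("Case=Acc", 0))   # 0 (empty morph/POS) instead of KeyError
```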

@@ -228,12 +225,12 @@ class Morphologizer(Tagger):
                 # doesn't, so if either is None, treat both as None here so that
                 # truths doesn't end up with an unknown morph+POS combination
                 if pos is None or morph is None:
-                    pos = None
-                    morph = None
-                label_dict = Morphology.feats_to_dict(morph)
-                if pos:
-                    label_dict[self.POS_FEAT] = pos
-                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    label = None
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[self.POS_FEAT] = pos
+                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
                 eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
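
For context, a sketch of how a gold label is built from a FEATS string plus POS, assuming spaCy v3's `Morphology.feats_to_dict` and using a literal `"POS"` key to stand in for `self.POS_FEAT`:

```python
from spacy.morphology import Morphology

label_dict = Morphology.feats_to_dict("Case=Nom|Number=Sing")
print(label_dict)           # {'Case': 'Nom', 'Number': 'Sing'}
label_dict["POS"] = "NOUN"  # the POS tag is folded into the same feature dict
```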

spacy/tests/pipeline/test_morphologizer.py
@@ -116,3 +116,23 @@ def test_overfitting_IO():
     no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)
+
+    # Test without POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            token.pos_ = ""
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["Feat=N", "Feat=V", "", ""]
+    gold_pos_tags = ["", "", "", ""]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags

spacy/tokens/_serialize.py
@@ -198,7 +198,10 @@ class DocBin:

         DOCS: https://nightly.spacy.io/api/docbin#from_bytes
         """
-        msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
+        try:
+            msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
+        except zlib.error:
+            raise ValueError(Errors.E1014)
         self.attrs = msg["attrs"]
         self.strings = set(msg["strings"])
         lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
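
A short sketch of the failure mode this guards against: bytes that aren't zlib-compressed DocBin data previously surfaced as a bare `zlib.error`, and with this change raise the new E1014 with a conversion hint:

```python
from spacy.tokens import DocBin

try:
    DocBin().from_bytes(b"this is not .spacy data")
except ValueError as err:
    print(err)  # E1014: "Error loading DocBin data. It doesn't look like ..."
```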

website/docs/api/transformer.md
@@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters.
 > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 > ```

-| Setting                 | Description |
-| ----------------------- | ----------- |
-| `max_batch_items`       | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
-| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
-| `model`                 | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
+| Setting                 | Description |
+| ----------------------- | ----------- |
+| `max_batch_items`       | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `model`                 | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |

 ```python
 https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
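
The docs fix renames the documented attribute from `Doc._.transformer_data` to the actual `Doc._.trf_data`. A quick usage sketch, assuming `spacy-transformers` and a transformer pipeline such as `en_core_web_trf` are installed:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("spaCy is written in Cython.")
# trf_data holds the transformer output, set before any extra-annotation callback.
print(type(doc._.trf_data))
```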

website script for the model pages (formatAccuracy)
@@ -120,7 +120,7 @@ function formatAccuracy(data) {
         ? null
         : {
               label,
-              value: (value * 100).toFixed(2),
+              value: value.toFixed(2),
               help: MODEL_META[label],
           }
     })