Merge branch 'develop' into nightly.spacy.io

commit b42c0d5161
Ines Montani committed 2020-10-13 15:41:28 +02:00
54 changed files with 365 additions and 263 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a43,<8.0.0a50",
"thinc>=8.0.0a44,<8.0.0a50",
"blis>=0.4.0,<0.8.0",
"pytokenizations",
"pathy"

View File

@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a43,<8.0.0a50
thinc>=8.0.0a44,<8.0.0a50
blis>=0.4.0,<0.8.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a43,<8.0.0a50
thinc>=8.0.0a44,<8.0.0a50
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a43,<8.0.0a50
thinc>=8.0.0a44,<8.0.0a50
blis>=0.4.0,<0.8.0
wasabi>=0.8.0,<1.1.0
srsly>=2.3.0,<3.0.0

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a36"
__version__ = "3.0.0a41"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -253,7 +253,7 @@ def _get_converter(msg, converter, input_path):
if converter == "auto":
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open() as file_:
with input_path.open(encoding="utf8") as file_:
input_data = file_.read()
converter_autodetect = autodetect_ner_format(input_data)
if converter_autodetect == "ner":

View File

@ -32,10 +32,10 @@ es:
word_vectors: null
transformer:
efficiency:
name: mrm8488/RuPERTa-base
name: dccuchile/bert-base-spanish-wwm-cased
size_factor: 3
accuracy:
name: mrm8488/RuPERTa-base
name: dccuchile/bert-base-spanish-wwm-cased
size_factor: 3
sv:
word_vectors: null
@ -101,3 +101,21 @@ pl:
accuracy:
name: dkleczek/bert-base-polish-cased-v1
size_factor: 3
nl:
word_vectors: null
transformer:
efficiency:
name: pdelobelle/robbert-v2-dutch-base
size_factor: 3
accuracy:
name: pdelobelle/robbert-v2-dutch-base
size_factor: 3
pt:
word_vectors: null
transformer:
efficiency:
name: neuralmind/bert-base-portuguese-cased
size_factor: 3
accuracy:
name: neuralmind/bert-base-portuguese-cased
size_factor: 3

View File

@ -456,6 +456,17 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
"is not set or None. If you've implemented a custom component, make "
"sure to store the component model as `self.model` in your "
"component's __init__ method.")
E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute "
"is not set or None. If you've implemented a custom component, make "
"sure to store the current `nlp` object's vocab as `self.vocab` in "
"your component's __init__ method.")
E900 = ("Could not run the full pipeline for evaluation. If you specified "
"frozen components, make sure they were already initialized and "
"trained. Full pipeline: {pipeline}")
E901 = ("Failed to remove existing output directory: {path}. If your "
"config and the components you train change between runs, a "
"non-empty output directory can lead to stale pipeline data. To "
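
For context, a minimal sketch of the pattern that E898/E899 enforce, mirroring the custom-pipe serialization test added later in this commit (the component name and empty `cfg` are illustrative):

```python
from spacy.pipeline import TrainablePipe
from spacy.vocab import Vocab
from thinc.api import Linear

class CustomPipe(TrainablePipe):
    def __init__(self, vocab, model, name="custom"):
        # Storing these two attributes is exactly what the new
        # serialization checks behind E898/E899 look for.
        self.vocab = vocab
        self.model = model
        self.name = name
        self.cfg = {}

pipe = CustomPipe(Vocab(), Linear())
pipe_bytes = pipe.to_bytes()  # would raise E898/E899 if model/vocab were left unset
```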

View File

@ -30,7 +30,6 @@ cdef class KnowledgeBase:
cdef Pool mem
cpdef readonly Vocab vocab
cdef int64_t entity_vector_length
cdef public set _added_strings
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _KBEntryC struct in the _entries vector).

View File

@ -92,7 +92,6 @@ cdef class KnowledgeBase:
self._alias_index = PreshMap()
self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
self._added_strings = set()
@property
def entity_vector_length(self):
@ -114,16 +113,12 @@ cdef class KnowledgeBase:
def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index]
def add_string(self, string: str):
self._added_strings.add(string)
return self.vocab.strings.add(string)
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
"""
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
Return the hash of the entity ID/name at the end.
"""
cdef hash_t entity_hash = self.add_string(entity)
cdef hash_t entity_hash = self.vocab.strings.add(entity)
# Return if this entity was added before
if entity_hash in self._entry_index:
@ -157,7 +152,7 @@ cdef class KnowledgeBase:
cdef hash_t entity_hash
while i < len(entity_list):
# only process this entity if its unique ID hadn't been added before
entity_hash = self.add_string(entity_list[i])
entity_hash = self.vocab.strings.add(entity_list[i])
if entity_hash in self._entry_index:
warnings.warn(Warnings.W018.format(entity=entity_list[i]))
@ -203,7 +198,7 @@ cdef class KnowledgeBase:
if prob_sum > 1.00001:
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
cdef hash_t alias_hash = self.add_string(alias)
cdef hash_t alias_hash = self.vocab.strings.add(alias)
# Check whether this alias was added before
if alias_hash in self._alias_index:
@ -332,7 +327,7 @@ cdef class KnowledgeBase:
raise ValueError(Errors.E928.format(loc=path))
serialize = {}
serialize["contents"] = lambda p: self.write_contents(p)
serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
@ -343,7 +338,7 @@ cdef class KnowledgeBase:
raise ValueError(Errors.E928.format(loc=path))
deserialize = {}
deserialize["contents"] = lambda p: self.read_contents(p)
deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
util.from_disk(path, deserialize, exclude)
def write_contents(self, file_path):

View File

@ -62,6 +62,7 @@ _ordinal_words = [
_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
@ -75,11 +76,11 @@ def like_num(text):
text_lower = text.lower()
#Check cardinal number
# Check cardinal number
if text_lower in _num_words:
return True
#Check ordinal number
# Check ordinal number
if text_lower in _ordinal_words:
return True
if text_lower.endswith(_ordinal_endings):

View File

@ -51,9 +51,8 @@ def noun_chunks(doclike):
elif word.dep == conj:
cc_token = word.left_edge
prev_end = cc_token.i
yield cc_token.right_edge.i + 1, extend_right(word), np_label # Shave off cc tokens from the NP
# Shave off cc tokens from the NP
yield cc_token.right_edge.i + 1, extend_right(word), np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -1,5 +1,5 @@
from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
from typing import Tuple, Iterator
from typing import Tuple
from dataclasses import dataclass
import random
import itertools
@ -1034,6 +1034,9 @@ class Language:
)
)
disable = to_disable
# DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
# those pipes that were already disabled.
disable = [d for d in disable if d not in self._disabled]
return DisabledPipes(self, disable)
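
A rough sketch of the behavior this guard is meant to produce, using a blank English pipeline (the choice of components here is illustrative):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.add_pipe("tagger")
nlp.disable_pipe("tagger")               # disabled before entering the block
assert nlp.pipe_names == ["sentencizer"]

with nlp.select_pipes(disable=["sentencizer", "tagger"]):
    assert nlp.pipe_names == []

# Only "sentencizer" is restored; "tagger" was already in nlp.disabled,
# so the context manager leaves it disabled.
assert nlp.pipe_names == ["sentencizer"]
```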
def make_doc(self, text: str) -> Doc:
@ -1194,7 +1197,9 @@ class Language:
doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(method="Language.initialize", obj=type(get_examples))
err = Errors.E930.format(
method="Language.initialize", obj=type(get_examples)
)
raise TypeError(err)
# Make sure the config is interpolated so we can resolve subsections
config = self.config.interpolate()

View File

@ -1,3 +1,4 @@
from pathlib import Path
from typing import Optional, Callable, Iterable
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear
@ -25,7 +26,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
@registry.misc.register("spacy.KBFromFile.v1")
def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
def kb_from_file(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=1)
kb.from_disk(kb_path)

View File

@ -24,11 +24,11 @@ def build_simple_cnn_text_classifier(
"""
with Model.define_operators({">>": chain}):
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO"))
output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
model.set_ref("output_layer", output_layer)
else:
linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
model = (
tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
)

View File

@ -110,7 +110,7 @@ def MultiHashEmbed(
The features used can be configured with the 'attrs' argument. The suggested
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
account some subword information, without construction a fully character-based
account some subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in
the representation as well, in which case the vectors table will be kept static
(i.e. it's not updated).
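
As a hedged illustration of the docstring above, a direct call with one row count per attribute; the signature and values are assumed from the test parameters and the config example elsewhere in this commit:

```python
from spacy.ml.models import MultiHashEmbed

embed = MultiHashEmbed(
    width=128,
    attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
    rows=[5000, 2500, 2500, 2500],         # one row count per attribute
    include_static_vectors=False,          # set True to also mix in static vectors
)
```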
@ -177,7 +177,7 @@ def CharacterEmbed(
rows: int,
nM: int,
nC: int,
also_use_static_vectors: bool,
include_static_vectors: bool,
feature: Union[int, str] = "LOWER",
) -> Model[List[Doc], List[Floats2d]]:
"""Construct an embedded representation based on character embeddings, using
@ -204,13 +204,13 @@ def CharacterEmbed(
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
are between 3 and 8, although it may depend on the length of words in the
language.
also_use_static_vectors (bool): Whether to also use static word vectors.
include_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
feature = intify_attr(feature)
if feature is None:
raise ValueError(Errors.E911(feat=feature))
if also_use_static_vectors:
if include_static_vectors:
model = chain(
concatenate(
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
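
A small sketch of the renamed argument in use; the values mirror the Morphologizer default config further down in this commit, and the import path is assumed:

```python
from spacy.ml.models import CharacterEmbed

embed = CharacterEmbed(
    width=128,
    rows=7000,
    nM=64,                         # dimension per character embedding
    nC=8,                          # number of UTF-8 bytes embedded per word
    include_static_vectors=False,  # renamed from also_use_static_vectors
)
```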

View File

@ -39,7 +39,6 @@ def forward(
key_attr = model.attrs["key_attr"]
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
V = cast(Floats2d, docs[0].vocab.vectors.data)
mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
rows = model.ops.flatten(
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
)
@ -47,8 +46,11 @@ def forward(
model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True),
model.ops.asarray([len(doc) for doc in docs], dtype="i"),
)
if mask is not None:
output.data *= mask
mask = None
if is_train:
mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
if mask is not None:
output.data *= mask
def backprop(d_output: Ragged) -> List[Doc]:
if mask is not None:

View File

@ -1,4 +1,4 @@
from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator
from typing import List, Dict, Union, Iterable, Any, Optional, Callable
from typing import Tuple
import srsly
from pathlib import Path
@ -57,7 +57,6 @@ class AttributeRuler(Pipe):
self.attrs = []
self._attrs_unnormed = [] # store for reference
self.indices = []
self._added_strings = set()
def clear(self) -> None:
"""Reset all patterns."""
@ -187,16 +186,12 @@ class AttributeRuler(Pipe):
# We need to make a string here, because otherwise the ID we pass back
# will be interpreted as the hash of a string, rather than an ordinal.
key = str(len(self.attrs))
self.matcher.add(self.add_string(key), patterns)
self.matcher.add(self.vocab.strings.add(key), patterns)
self._attrs_unnormed.append(attrs)
attrs = normalize_token_attrs(self.vocab, attrs)
self.attrs.append(attrs)
self.indices.append(index)
def add_string(self, string: str):
self._added_strings.add(string)
return self.vocab.strings.add(string)
def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
"""Add patterns from a list of pattern dicts with the keys as the
arguments to AttributeRuler.add.
@ -256,8 +251,8 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
"""
serialize = {}
serialize["vocab"] = self.vocab.to_bytes
serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
return util.to_bytes(serialize, exclude)
def from_bytes(
@ -276,7 +271,7 @@ class AttributeRuler(Pipe):
self.add_patterns(srsly.msgpack_loads(b))
deserialize = {
"strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
"vocab": lambda b: self.vocab.from_bytes(b),
"patterns": load_patterns,
}
util.from_bytes(bytes_data, deserialize, exclude)
@ -293,7 +288,7 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
"""
serialize = {
"strings.json": lambda p: srsly.write_json(p, self._added_strings),
"vocab": lambda p: self.vocab.to_disk(p),
"patterns": lambda p: srsly.write_msgpack(p, self.patterns),
}
util.to_disk(path, serialize, exclude)
@ -314,7 +309,7 @@ class AttributeRuler(Pipe):
self.add_patterns(srsly.read_msgpack(p))
deserialize = {
"strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)],
"vocab": lambda p: self.vocab.from_disk(p),
"patterns": load_patterns,
}
util.from_disk(path, deserialize, exclude)

View File

@ -453,6 +453,7 @@ class EntityLinker(TrainablePipe):
DOCS: https://nightly.spacy.io/api/entitylinker#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["kb"] = lambda p: self.kb.to_disk(p)
serialize["model"] = lambda p: self.model.to_disk(p)
@ -481,8 +482,6 @@ class EntityLinker(TrainablePipe):
deserialize["kb"] = lambda p: self.kb.from_disk(p)
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
for s in self.kb._added_strings:
self.vocab.strings.add(s)
return self
def rehearse(self, examples, *, sgd=None, losses=None, **config):

View File

@ -281,6 +281,7 @@ class Lemmatizer(Pipe):
DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["lookups"] = lambda p: self.lookups.to_disk(p)
util.to_disk(path, serialize, exclude)
@ -296,6 +297,7 @@ class Lemmatizer(Pipe):
DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
"""
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
util.from_disk(path, deserialize, exclude)
self._validate_tables()
@ -310,6 +312,7 @@ class Lemmatizer(Pipe):
DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
"""
serialize = {}
serialize["vocab"] = self.vocab.to_bytes
serialize["lookups"] = self.lookups.to_bytes
return util.to_bytes(serialize, exclude)
@ -325,6 +328,7 @@ class Lemmatizer(Pipe):
DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
"""
deserialize = {}
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
util.from_bytes(bytes_data, deserialize, exclude)
self._validate_tables()

View File

@ -32,7 +32,7 @@ width = 128
rows = 7000
nM = 64
nC = 8
also_use_static_vectors = false
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@ -95,7 +95,6 @@ class Morphologizer(Tagger):
# add mappings for empty morph
self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
self._added_strings = set()
@property
def labels(self):
@ -129,7 +128,6 @@ class Morphologizer(Tagger):
label_dict.pop(self.POS_FEAT)
# normalize morph string and add to morphology table
norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
self.add_string(norm_morph)
# add label mappings
if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = norm_morph
@ -161,7 +159,6 @@ class Morphologizer(Tagger):
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
self.add_string(norm_label)
# add label->morph and label->POS mappings
if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = morph
@ -179,7 +176,6 @@ class Morphologizer(Tagger):
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
self.add_string(norm_label)
gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
doc_sample.append(example.x)
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
@ -238,7 +234,6 @@ class Morphologizer(Tagger):
if pos:
label_dict[self.POS_FEAT] = pos
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
self.add_string(label)
eg_truths.append(label)
truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths)

View File

@ -61,7 +61,6 @@ class SentenceRecognizer(Tagger):
self.name = name
self._rehearsal_model = None
self.cfg = {}
self._added_strings = set()
@property
def labels(self):

View File

@ -78,7 +78,6 @@ class Tagger(TrainablePipe):
self._rehearsal_model = None
cfg = {"labels": labels or []}
self.cfg = dict(sorted(cfg.items()))
self._added_strings = set()
@property
def labels(self):
@ -313,7 +312,7 @@ class Tagger(TrainablePipe):
return 0
self._allow_extra_label()
self.cfg["labels"].append(label)
self.add_string(label)
self.vocab.strings.add(label)
return 1
def score(self, examples, **kwargs):

View File

@ -110,7 +110,6 @@ class TextCategorizer(TrainablePipe):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg)
self._added_strings = set()
@property
def labels(self) -> Tuple[str]:
@ -301,7 +300,7 @@ class TextCategorizer(TrainablePipe):
return 0
self._allow_extra_label()
self.cfg["labels"].append(label)
self.add_string(label)
self.vocab.strings.add(label)
return 1
def initialize(

View File

@ -64,7 +64,6 @@ class Tok2Vec(TrainablePipe):
self.name = name
self.listeners = []
self.cfg = {}
self._added_strings = set()
def add_listener(self, listener: "Tok2VecListener") -> None:
"""Add a listener for a downstream component. Usually internals."""

View File

@ -5,4 +5,3 @@ cdef class TrainablePipe(Pipe):
cdef public Vocab vocab
cdef public object model
cdef public object cfg
cdef public set _added_strings

View File

@ -13,6 +13,7 @@ from ..vocab import Vocab
from ..language import Language
from ..training import Example
cdef class TrainablePipe(Pipe):
"""This class is a base class and not instantiated directly. Trainable
pipeline components like the EntityRecognizer or TextCategorizer inherit
@ -35,7 +36,6 @@ cdef class TrainablePipe(Pipe):
self.model = model
self.name = name
self.cfg = dict(cfg)
self._added_strings = set()
def __call__(self, Doc doc) -> Doc:
"""Apply the pipe to one document. The document is modified in place,
@ -198,10 +198,6 @@ cdef class TrainablePipe(Pipe):
"""
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
def add_string(self, string: str):
self._added_strings.add(string)
return self.vocab.strings.add(string)
@property
def is_trainable(self) -> bool:
return True
@ -244,6 +240,16 @@ cdef class TrainablePipe(Pipe):
"""
self.model.finish_update(sgd)
def _validate_serialization_attrs(self):
"""Check that the pipe implements the required attributes. If a subclass
implements a custom __init__ method but doesn't set these attributes,
they currently default to None, so we need to perform additional checks.
"""
if not hasattr(self, "vocab") or self.vocab is None:
raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
if not hasattr(self, "model") or self.model is None:
raise ValueError(Errors.E898.format(name=util.get_object_name(self)))
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.
@ -252,11 +258,12 @@ cdef class TrainablePipe(Pipe):
DOCS: https://nightly.spacy.io/api/pipe#to_bytes
"""
self._validate_serialization_attrs()
serialize = {}
if hasattr(self, "cfg"):
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
serialize["vocab"] = self.vocab.to_bytes
serialize["model"] = self.model.to_bytes
serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
@ -267,6 +274,7 @@ cdef class TrainablePipe(Pipe):
DOCS: https://nightly.spacy.io/api/pipe#from_bytes
"""
self._validate_serialization_attrs()
def load_model(b):
try:
@ -275,9 +283,9 @@ cdef class TrainablePipe(Pipe):
raise ValueError(Errors.E149) from None
deserialize = {}
deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)]
if hasattr(self, "cfg"):
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["model"] = load_model
util.from_bytes(bytes_data, deserialize, exclude)
return self
@ -290,10 +298,11 @@ cdef class TrainablePipe(Pipe):
DOCS: https://nightly.spacy.io/api/pipe#to_disk
"""
self._validate_serialization_attrs()
serialize = {}
if hasattr(self, "cfg"):
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["model"] = lambda p: self.model.to_disk(p)
util.to_disk(path, serialize, exclude)
@ -306,6 +315,7 @@ cdef class TrainablePipe(Pipe):
DOCS: https://nightly.spacy.io/api/pipe#from_disk
"""
self._validate_serialization_attrs()
def load_model(p):
try:
@ -314,9 +324,9 @@ cdef class TrainablePipe(Pipe):
raise ValueError(Errors.E149) from None
deserialize = {}
deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
if hasattr(self, "cfg"):
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
return self

View File

@ -76,7 +76,6 @@ cdef class Parser(TrainablePipe):
self.add_multitask_objective(multitask)
self._rehearsal_model = None
self._added_strings = set()
def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments"""
@ -120,7 +119,7 @@ cdef class Parser(TrainablePipe):
resized = True
if resized:
self._resize()
self.add_string(label)
self.vocab.strings.add(label)
return 1
return 0
@ -456,24 +455,24 @@ cdef class Parser(TrainablePipe):
def to_disk(self, path, exclude=tuple()):
serializers = {
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
'strings.json': lambda p: srsly.write_json(p, self._added_strings),
'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
'cfg': lambda p: srsly.write_json(p, self.cfg)
"model": lambda p: (self.model.to_disk(p) if self.model is not True else True),
"vocab": lambda p: self.vocab.to_disk(p),
"moves": lambda p: self.moves.to_disk(p, exclude=["strings"]),
"cfg": lambda p: srsly.write_json(p, self.cfg)
}
util.to_disk(path, serializers, exclude)
def from_disk(self, path, exclude=tuple()):
deserializers = {
'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)],
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
'model': lambda p: None,
"vocab": lambda p: self.vocab.from_disk(p),
"moves": lambda p: self.moves.from_disk(p, exclude=["strings"]),
"cfg": lambda p: self.cfg.update(srsly.read_json(p)),
"model": lambda p: None,
}
util.from_disk(path, deserializers, exclude)
if 'model' not in exclude:
if "model" not in exclude:
path = util.ensure_path(path)
with (path / 'model').open('rb') as file_:
with (path / "model").open("rb") as file_:
bytes_data = file_.read()
try:
self._resize()
@ -485,7 +484,7 @@ cdef class Parser(TrainablePipe):
def to_bytes(self, exclude=tuple()):
serializers = {
"model": lambda: (self.model.to_bytes()),
"strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)),
"vocab": lambda: self.vocab.to_bytes(),
"moves": lambda: self.moves.to_bytes(exclude=["strings"]),
"cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
}
@ -493,7 +492,7 @@ cdef class Parser(TrainablePipe):
def from_bytes(self, bytes_data, exclude=tuple()):
deserializers = {
"strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
"vocab": lambda b: self.vocab.from_bytes(b),
"moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
"model": lambda b: None,

View File

@ -282,7 +282,7 @@ class ModelMetaSchema(BaseModel):
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
performance: Dict[str, Any] = Field({}, title="Accuracy and speed numbers")
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
# fmt: on

View File

@ -239,10 +239,12 @@ def th_tokenizer():
def tr_tokenizer():
return get_lang_class("tr")().tokenizer
@pytest.fixture(scope="session")
def tr_vocab():
return get_lang_class("tr").Defaults.create_vocab()
@pytest.fixture(scope="session")
def tt_tokenizer():
return get_lang_class("tt")().tokenizer

View File

@ -608,14 +608,11 @@ def test_doc_init_iob():
doc = Doc(Vocab(), words=words, ents=ents)
@pytest.mark.xfail
def test_doc_set_ents_spans(en_tokenizer):
def test_doc_set_ents_invalid_spans(en_tokenizer):
doc = en_tokenizer("Some text about Colombia and the Czech Republic")
spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
# If this line is uncommented, it works:
# print(spans)
doc.ents = spans
assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]
with pytest.raises(IndexError):
doc.ents = spans

View File

@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
attrs = {"lemma": "none", "ent_type": "none"}
retokenizer.merge(doc[0:2], attrs=attrs)
retokenizer.merge(doc[-2:], attrs=attrs)
sent1, sent2 = list(doc.sents)
assert len(sent1) == init_len - 1
assert len(sent2) == init_len2 - 1

View File

@ -225,7 +225,7 @@ def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
assert chunks[0].text_with_ws == "en sevdiğim ses sanatçısı "
def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
def test_tr_noun_chunks_acl_nmod2(tr_tokenizer):
text = "bildiğim bir turizm şirketi"
heads = [3, 3, 3, 3]
deps = ["acl", "det", "nmod", "ROOT"]
@ -326,7 +326,7 @@ def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer):
def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
text = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo"
heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9]
deps = ["obl", "nmod" , "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
deps = ["obl", "nmod", "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"]
tokens = tr_tokenizer(text)
doc = Doc(
@ -334,7 +334,10 @@ def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
)
chunks = list(doc.noun_chunks)
assert len(chunks) == 1
assert chunks[0].text_with_ws == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
assert (
chunks[0].text_with_ws
== "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
)
def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
@ -350,7 +353,8 @@ def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
assert len(chunks) == 1
assert chunks[0].text_with_ws == "kız ve erkek çocuklar "
def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
def test_tr_noun_chunks_two_nouns_in_nmod2(tr_tokenizer):
text = "tatlı ve gürbüz çocuklar"
heads = [3, 2, 0, 3]
deps = ["amod", "cc", "conj", "ROOT"]
@ -378,6 +382,7 @@ def test_tr_noun_chunks_conj_simple(tr_tokenizer):
assert chunks[0].text_with_ws == "ben "
assert chunks[1].text_with_ws == "Sen "
def test_tr_noun_chunks_conj_three(tr_tokenizer):
text = "sen, ben ve ondan"
heads = [0, 2, 0, 4, 0]
@ -394,7 +399,7 @@ def test_tr_noun_chunks_conj_three(tr_tokenizer):
assert chunks[2].text_with_ws == "sen "
def test_tr_noun_chunks_conj_three(tr_tokenizer):
def test_tr_noun_chunks_conj_three2(tr_tokenizer):
text = "ben ya da sen ya da onlar"
heads = [0, 3, 1, 0, 6, 4, 3]
deps = ["ROOT", "cc", "fixed", "conj", "cc", "fixed", "conj"]
@ -499,7 +504,7 @@ def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
assert chunks[0].text_with_ws == "Gazi Mustafa Kemal "
def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
def test_tr_noun_chunks_flat_names_and_title2(tr_tokenizer):
text = "Ahmet Vefik Paşa"
heads = [2, 0, 2]
deps = ["nmod", "flat", "ROOT"]

View File

@ -15,8 +15,8 @@ from spacy.lang.tr.lex_attrs import like_num
"üçüncü",
"beşinci",
"100üncü",
"8inci"
]
"8inci",
],
)
def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
assert like_num(word)
@ -26,4 +26,3 @@ def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
def test_tr_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -5,6 +5,7 @@ from spacy.kb import KnowledgeBase, get_candidates, Candidate
from spacy.vocab import Vocab
from spacy import util, registry
from spacy.ml import load_kb
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.lang.en import English
@ -121,9 +122,7 @@ def test_kb_default(nlp):
def test_kb_custom_length(nlp):
"""Test that the default (empty) KB can be configured with a custom entity length"""
entity_linker = nlp.add_pipe(
"entity_linker", config={"entity_vector_length": 35}
)
entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 35})
assert len(entity_linker.kb) == 0
assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0
@ -213,16 +212,11 @@ def test_el_pipe_configuration(nlp):
kb = KnowledgeBase(vocab, entity_vector_length=1)
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
kb.add_alias(
alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]
)
kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
return kb
# run an EL pipe without a trained context encoder, to check the candidate generation step only
entity_linker = nlp.add_pipe(
"entity_linker",
config={"incl_context": False},
)
entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False})
entity_linker.set_kb(create_kb)
# With the default get_candidates function, matching is case-sensitive
text = "Douglas and douglas are not the same."
@ -453,14 +447,10 @@ def test_overfitting_IO():
return mykb
# Create the Entity Linker component and add it to the pipeline
entity_linker = nlp.add_pipe(
"entity_linker",
last=True,
)
entity_linker = nlp.add_pipe("entity_linker", last=True)
entity_linker.set_kb(create_kb)
assert "Q2146908" in entity_linker.vocab.strings
assert "Q2146908" in entity_linker.kb.vocab.strings
assert "Q2146908" in entity_linker.kb._added_strings
# train the NEL pipe
optimizer = nlp.initialize(get_examples=lambda: train_examples)
@ -507,6 +497,32 @@ def test_overfitting_IO():
assert predictions == GOLD_entities
def test_kb_serialization():
# Test that the KB can be used in a pipeline with a different vocab
vector_length = 3
with make_tempdir() as tmp_dir:
kb_dir = tmp_dir / "kb"
nlp1 = English()
assert "Q2146908" not in nlp1.vocab.strings
mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
assert "Q2146908" in nlp1.vocab.strings
mykb.to_disk(kb_dir)
nlp2 = English()
assert "RandomWord" not in nlp2.vocab.strings
nlp2.vocab.strings.add("RandomWord")
assert "RandomWord" in nlp2.vocab.strings
assert "Q2146908" not in nlp2.vocab.strings
# Create the Entity Linker component with the KB from file, and check the final vocab
entity_linker = nlp2.add_pipe("entity_linker", last=True)
entity_linker.set_kb(load_kb(kb_dir))
assert "Q2146908" in nlp2.vocab.strings
assert "RandomWord" in nlp2.vocab.strings
def test_scorer_links():
train_examples = []
nlp = English()

View File

@ -101,4 +101,3 @@ def test_overfitting_IO():
doc2 = nlp2(test_text)
assert [str(t.morph) for t in doc2] == gold_morphs
assert [t.pos_ for t in doc2] == gold_pos_tags
assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings

View File

@ -129,6 +129,7 @@ def test_enable_pipes_method(nlp, name):
@pytest.mark.parametrize("name", ["my_component"])
def test_disable_pipes_context(nlp, name):
"""Test that an enabled component stays enabled after running the context manager."""
nlp.add_pipe("new_pipe", name=name)
assert nlp.has_pipe(name)
with nlp.select_pipes(disable=name):
@ -136,6 +137,18 @@ def test_disable_pipes_context(nlp, name):
assert nlp.has_pipe(name)
@pytest.mark.parametrize("name", ["my_component"])
def test_disable_pipes_context_restore(nlp, name):
"""Test that a disabled component stays disabled after running the context manager."""
nlp.add_pipe("new_pipe", name=name)
assert nlp.has_pipe(name)
nlp.disable_pipe(name)
assert not nlp.has_pipe(name)
with nlp.select_pipes(disable=name):
assert not nlp.has_pipe(name)
assert not nlp.has_pipe(name)
def test_select_pipes_list_arg(nlp):
for name in ["c1", "c2", "c3"]:
nlp.add_pipe("new_pipe", name=name)

View File

@ -80,4 +80,3 @@ def test_overfitting_IO():
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings

View File

@ -98,7 +98,6 @@ def test_overfitting_IO():
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["tagger"] < 0.00001
assert tagger._added_strings == {"J", "N", "V"}
# test the trained model
test_text = "I like blue eggs"
@ -117,7 +116,6 @@ def test_overfitting_IO():
assert doc2[1].tag_ is "V"
assert doc2[2].tag_ is "J"
assert doc2[3].tag_ is "N"
assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"}
def test_tagger_requires_labels():

View File

@ -146,7 +146,6 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert textcat.model.get_dim("nO") == 2
assert textcat._added_strings == {"NEGATIVE", "POSITIVE"}
for i in range(50):
losses = {}
@ -168,7 +167,6 @@ def test_overfitting_IO():
cats2 = doc2.cats
assert cats2["POSITIVE"] > 0.9
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"}
# Test scoring
scores = nlp.evaluate(train_examples)

View File

@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
[
(8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
(8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
],
)
# fmt: on

View File

@ -7,6 +7,7 @@ from spacy.kb import KnowledgeBase, Writer
from spacy.vectors import Vectors
from spacy.language import Language
from spacy.pipeline import TrainablePipe
from spacy.vocab import Vocab
from ..util import make_tempdir
@ -50,8 +51,9 @@ def custom_pipe():
else:
self.cfg = None
self.model = SerializableDummy()
self.vocab = vocab
return MyPipe(None)
return MyPipe(Vocab())
def tagger():

View File

@ -6,8 +6,8 @@ def test_issue6207(en_tokenizer):
# Make spans
s1 = doc[:4]
s2 = doc[3:6] # overlaps with s1
s3 = doc[5:7] # overlaps with s2, not s1
s2 = doc[3:6] # overlaps with s1
s3 = doc[5:7] # overlaps with s2, not s1
result = filter_spans((s1, s2, s3))
assert s1 in result

View File

@ -1,13 +1,13 @@
import pytest
import srsly
from spacy import registry, Vocab
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import TextCategorizer, SentenceRecognizer
from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
from spacy.lang.en import English
from thinc.api import Linear
import spacy
from ..util import make_tempdir
@ -89,7 +89,6 @@ def test_serialize_parser_strings(Parser):
assert label not in vocab2.strings
parser2 = Parser(vocab2, model, **config)
parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
assert parser1._added_strings == parser2._added_strings == {"FunnyLabel"}
assert label in parser2.vocab.strings
@ -166,17 +165,13 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
# check that custom labels are serialized as part of the component's strings.jsonl
tagger.add_label(label)
assert label in tagger.vocab.strings
assert tagger._added_strings == {label}
file_path = d / "tagger1"
tagger.to_disk(file_path)
strings = srsly.read_json(file_path / "strings.json")
assert strings == ["SomeWeirdLabel"]
# ensure that the custom strings are loaded back in when using the tagger in another pipeline
cfg = {"model": DEFAULT_TAGGER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
tagger2 = Tagger(de_vocab, model).from_disk(file_path)
assert label in tagger2.vocab.strings
assert tagger2._added_strings == {label}
def test_serialize_textcat_empty(en_vocab):
@ -253,3 +248,40 @@ def test_serialize_pipeline_disable_enable():
assert nlp5.pipe_names == ["ner"]
assert nlp5.component_names == ["ner"]
assert nlp5.disabled == []
def test_serialize_custom_trainable_pipe():
class BadCustomPipe1(TrainablePipe):
def __init__(self, vocab):
pass
class BadCustomPipe2(TrainablePipe):
def __init__(self, vocab):
self.vocab = vocab
self.model = None
class CustomPipe(TrainablePipe):
def __init__(self, vocab, model):
self.vocab = vocab
self.model = model
pipe = BadCustomPipe1(Vocab())
with pytest.raises(ValueError):
pipe.to_bytes()
with make_tempdir() as d:
with pytest.raises(ValueError):
pipe.to_disk(d)
pipe = BadCustomPipe2(Vocab())
with pytest.raises(ValueError):
pipe.to_bytes()
with make_tempdir() as d:
with pytest.raises(ValueError):
pipe.to_disk(d)
pipe = CustomPipe(Vocab(), Linear())
pipe_bytes = pipe.to_bytes()
new_pipe = CustomPipe(Vocab(), Linear()).from_bytes(pipe_bytes)
assert new_pipe.to_bytes() == pipe_bytes
with make_tempdir() as d:
pipe.to_disk(d)
new_pipe = CustomPipe(Vocab(), Linear()).from_disk(d)
assert new_pipe.to_bytes() == pipe_bytes

View File

@ -1,10 +1,8 @@
from typing import List
import pytest
from thinc.api import fix_random_seed, Adam, set_dropout_rate
from numpy.testing import assert_array_equal
import numpy
from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
from spacy.ml.staticvectors import StaticVectors
@ -188,12 +186,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
assert_array_equal(get_all_params(model1), get_all_params(model2))
@pytest.mark.parametrize(
"model_func,kwargs",
[
(StaticVectors, {"nO": 128, "nM": 300}),
]
)
@pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})])
def test_empty_docs(model_func, kwargs):
nlp = English()
model = model_func(**kwargs).initialize()
@ -201,7 +194,7 @@ def test_empty_docs(model_func, kwargs):
for n_docs in range(3):
docs = [nlp("") for _ in range(n_docs)]
# Test predict
_ = model.predict(docs)
model.predict(docs)
# Test backprop
output, backprop = model.begin_update(docs)
_ = backprop(output)
backprop(output)

View File

@ -1419,7 +1419,7 @@ cdef class Doc:
if include_annotation["POS"]:
token_data["pos"] = token.pos_
if include_annotation["MORPH"]:
token_data["morph"] = token.morph
token_data["morph"] = token.morph.to_json()
if include_annotation["LEMMA"]:
token_data["lemma"] = token.lemma_
if include_annotation["DEP"]:

View File

@ -16,5 +16,4 @@ cdef class Span:
cdef public _vector
cdef public _vector_norm
cpdef int _recalculate_indices(self) except -1
cpdef np.ndarray to_array(self, object features)

View File

@ -150,7 +150,6 @@ cdef class Span:
DOCS: https://nightly.spacy.io/api/span#len
"""
self._recalculate_indices()
if self.end < self.start:
return 0
return self.end - self.start
@ -167,7 +166,6 @@ cdef class Span:
DOCS: https://nightly.spacy.io/api/span#getitem
"""
self._recalculate_indices()
if isinstance(i, slice):
start, end = normalize_slice(len(self), i.start, i.stop, i.step)
return Span(self.doc, start + self.start, end + self.start)
@ -188,7 +186,6 @@ cdef class Span:
DOCS: https://nightly.spacy.io/api/span#iter
"""
self._recalculate_indices()
for i in range(self.start, self.end):
yield self.doc[i]
@ -339,19 +336,6 @@ cdef class Span:
output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
return output
cpdef int _recalculate_indices(self) except -1:
if self.end > self.doc.length \
or self.doc.c[self.start].idx != self.start_char \
or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
start = token_by_start(self.doc.c, self.doc.length, self.start_char)
if self.start == -1:
raise IndexError(Errors.E036.format(start=self.start_char))
end = token_by_end(self.doc.c, self.doc.length, self.end_char)
if end == -1:
raise IndexError(Errors.E037.format(end=self.end_char))
self.start = start
self.end = end + 1
@property
def vocab(self):
"""RETURNS (Vocab): The Span's Doc's vocab."""
@ -520,7 +504,6 @@ cdef class Span:
DOCS: https://nightly.spacy.io/api/span#root
"""
self._recalculate_indices()
if "root" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["root"](self)
# This should probably be called 'head', and the other one called

View File

@ -11,11 +11,25 @@ if TYPE_CHECKING:
from ..language import Language # noqa: F401
def setup_table(
*, cols: List[str], widths: List[int], max_width: int = 13
) -> Tuple[List[str], List[int], List[str]]:
final_cols = []
final_widths = []
for col, width in zip(cols, widths):
if len(col) > max_width:
col = col[: max_width - 3] + "..." # shorten column if too long
final_cols.append(col.upper())
final_widths.append(max(len(col), width))
return final_cols, final_widths, ["r" for _ in final_widths]
@registry.loggers("spacy.ConsoleLogger.v1")
def console_logger(progress_bar: bool = False):
def setup_printer(
nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
write = lambda text: stdout.write(f"{text}\n")
msg = Printer(no_print=True)
# ensure that only trainable components are logged
logged_pipes = [
@ -26,15 +40,14 @@ def console_logger(progress_bar: bool = False):
eval_frequency = nlp.config["training"]["eval_frequency"]
score_weights = nlp.config["training"]["score_weights"]
score_cols = [col for col, value in score_weights.items() if value is not None]
score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header]
table_widths = [3, 6] + loss_widths + score_widths + [6]
table_aligns = ["r" for _ in table_widths]
stdout.write(msg.row(table_header, widths=table_widths) + "\n")
stdout.write(msg.row(["-" * width for width in table_widths]) + "\n")
spacing = 2
table_header, table_widths, table_aligns = setup_table(
cols=["E", "#"] + loss_cols + score_cols + ["Score"],
widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
)
write(msg.row(table_header, widths=table_widths, spacing=spacing))
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
progress = None
def log_step(info: Optional[Dict[str, Any]]) -> None:
@ -70,7 +83,9 @@ def console_logger(progress_bar: bool = False):
)
if progress is not None:
progress.close()
stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns) + "\n")
write(
msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing)
)
if progress_bar:
# Set disable=None, so that it disables on non-TTY
progress = tqdm.tqdm(

View File

@ -249,7 +249,10 @@ def create_evaluation_callback(
def evaluate() -> Tuple[float, Dict[str, float]]:
dev_examples = list(dev_corpus(nlp))
scores = nlp.evaluate(dev_examples)
try:
scores = nlp.evaluate(dev_examples)
except KeyError as e:
raise KeyError(Errors.E900.format(pipeline=nlp.pipe_names)) from e
# Calculate a weighted sum based on score_weights for the main score.
# We can only consider scores that are ints/floats, not dicts like
# entity scores per type etc.

View File

@ -622,7 +622,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
if not path.parent.exists():
raise IOError(Errors.E052.format(path=path.parent))
if not path.exists() or not path.is_file():
raise IOError(Errors.E053.format(path=path, name="meta.json"))
raise IOError(Errors.E053.format(path=path.parent, name="meta.json"))
meta = srsly.read_json(path)
for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]:
@ -821,7 +821,7 @@ def get_object_name(obj: Any) -> str:
obj (Any): The Python object, typically a function or class.
RETURNS (str): A human-readable name.
"""
if hasattr(obj, "name"):
if hasattr(obj, "name") and obj.name is not None:
return obj.name
if hasattr(obj, "__name__"):
return obj.__name__
@ -1361,11 +1361,12 @@ def check_bool_env_var(env_var: str) -> bool:
def _pipe(docs, proc, kwargs):
if hasattr(proc, "pipe"):
yield from proc.pipe(docs, **kwargs)
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
for arg in ["batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
doc = proc(doc, **kwargs)
yield doc
else:
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
for arg in ["batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
doc = proc(doc, **kwargs)
yield doc

View File

@ -637,13 +637,6 @@ into the "real world". This requires 3 main components:
> window_size = 1
> maxout_pieces = 3
> subword_features = true
>
> [kb_loader]
> @misc = "spacy.EmptyKB.v1"
> entity_vector_length = 64
>
> [get_candidates]
> @misc = "spacy.CandidateGenerator.v1"
> ```
The `EntityLinker` model architecture is a Thinc `Model` with a
@ -657,13 +650,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
### spacy.EmptyKB.v1 {#EmptyKB}
A function that creates a default, empty `KnowledgeBase` from a
[`Vocab`](/api/vocab) instance.
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created.
| Name | Description |
| ---------------------- | ----------------------------------------------------------------------------------- |
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
### spacy.KBFromFile.v1 {#KBFromFile}
A function that reads an existing `KnowledgeBase` from file.
| Name | Description |
| --------- | -------------------------------------------------------- |
| `kb_path` | The location of the KB that was stored to file. ~~Path~~ |
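
A hedged usage sketch, based on the serialization test added in this commit; `"path/to/kb"` is a placeholder for a directory previously written with `kb.to_disk()`:

```python
import spacy
from spacy.ml import load_kb

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker", last=True)
# load_kb returns a loader that reads the KB using the pipeline's vocab
entity_linker.set_kb(load_kb("path/to/kb"))
```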
### spacy.CandidateGenerator.v1 {#CandidateGenerator}
A function that takes as input a [`KnowledgeBase`](/api/kb) and a

View File

@ -34,20 +34,20 @@ architectures and their arguments and hyperparameters.
> "incl_prior": True,
> "incl_context": True,
> "model": DEFAULT_NEL_MODEL,
> "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
> "entity_vector_length": 64,
> "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
> }
> nlp.add_pipe("entity_linker", config=config)
> ```
| Setting | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~Callable[[Vocab], KnowledgeBase]~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| Setting | Description |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@ -65,10 +65,6 @@ architectures and their arguments and hyperparameters.
> config = {"model": {"@architectures": "my_el.v1"}}
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction via add_pipe with custom KB and candidate generation
> config = {"kb": {"@misc": "my_kb.v1"}}
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction from class
> from spacy.pipeline import EntityLinker
> entity_linker = EntityLinker(nlp.vocab, model)
@ -76,21 +72,25 @@ architectures and their arguments and hyperparameters.
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal
`KnowledgeBase` as well as the Candidate generator can be customized by
providing custom registered functions.
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Description |
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
Upon construction of the entity linker component, an empty knowledge base is
constructed with the provided `entity_vector_length`. If you want to use a
custom knowledge base, you should either call
[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
[`initialize`](/api/entitylinker#initialize) call.
| Name | Description |
| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
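
For illustration, the construction path described above with a custom vector length, mirroring the updated KB test in this commit:

```python
import spacy

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 35})
# The component starts out with an empty KB of the configured vector length
assert entity_linker.kb.get_size_entities() == 0
```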
## EntityLinker.\_\_call\_\_ {#call tag="method"}
@ -139,6 +139,28 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityLinker.set_kb {#set_kb tag="method" new="3"}
The `kb_loader` should be a function that takes a `Vocab` instance and creates
the `KnowledgeBase`, ensuring that the strings of the knowledge base are synced
with the current vocab.
> #### Example
>
> ```python
> def create_kb(vocab):
> kb = KnowledgeBase(vocab, entity_vector_length=128)
> kb.add_entity(...)
> kb.add_alias(...)
> return kb
> entity_linker = nlp.add_pipe("entity_linker")
> entity_linker.set_kb(create_kb)
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------------- |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
## EntityLinker.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
@ -150,6 +172,11 @@ network,
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
Optionally, a `kb_loader` argument may be specified to change the internal
knowledge base. This argument should be a function that takes a `Vocab` instance
and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
are synced with the current vocab.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
@ -160,7 +187,7 @@ This method was previously called `begin_training`.
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
> entity_linker.initialize(lambda: [], nlp=nlp)
> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb)
> ```
| Name | Description |
@ -168,6 +195,7 @@ This method was previously called `begin_training`.
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
## EntityLinker.predict {#predict tag="method"}

View File

@ -516,17 +516,15 @@ Many neural network models are able to use word vector tables as additional
features, which sometimes results in significant improvements in accuracy.
spaCy's built-in embedding layer,
[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
word vector tables using the `also_use_static_vectors` flag. This setting is
also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
layer, which builds the default token-to-vector encoding architecture.
word vector tables using the `include_static_vectors` flag.
```ini
[tagger.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 128
rows = 7000
also_embed_subwords = true
also_use_static_vectors = true
attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = true
```
<Infobox title="How it works" emoji="💡">

View File

@ -1403,9 +1403,9 @@ especially useful if you want to pass in a string instead of calling
This example shows the implementation of a pipeline component that fetches
country meta data via the [REST Countries API](https://restcountries.eu), sets
entity annotations for countries, merges entities into one token and sets custom
attributes on the `Doc`, `Span` and `Token` for example, the capital,
latitude/longitude coordinates and even the country flag.
entity annotations for countries and sets custom attributes on the `Doc` and
`Span`, for example the capital, latitude/longitude coordinates and even the
country flag.
```python
### {executable="true"}
@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
# Set up the PhraseMatcher with Doc patterns for each country name
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
# Register attribute on the Token. We'll be overwriting this based on
# Register attributes on the Span. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter.
Token.set_extension("is_country", default=False)
Token.set_extension("country_capital", default=False)
Token.set_extension("country_latlng", default=False)
Token.set_extension("country_flag", default=False)
# Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_country == True.
Span.set_extension("is_country", default=None)
Span.set_extension("country_capital", default=None)
Span.set_extension("country_latlng", default=None)
Span.set_extension("country_flag", default=None)
# Register attribute on Doc via a getter that checks if the Doc
# contains a country entity
Doc.set_extension("has_country", getter=self.has_country)
Span.set_extension("has_country", getter=self.has_country)
def __call__(self, doc):
spans = [] # keep the spans for later so we can merge them afterwards
for _, start, end in self.matcher(doc):
# Generate Span representing the entity & set label
entity = Span(doc, start, end, label=self.label)
# Set custom attributes on entity. Can be extended with other data
# returned by the API, like currencies, country code, calling code etc.
entity._.set("is_country", True)
entity._.set("country_capital", self.countries[entity.text]["capital"])
entity._.set("country_latlng", self.countries[entity.text]["latlng"])
entity._.set("country_flag", self.countries[entity.text]["flag"])
spans.append(entity)
# Set custom attribute on each token of the entity
# Can be extended with other data returned by the API, like
# currencies, country code, flag, calling code etc.
for token in entity:
token._.set("is_country", True)
token._.set("country_capital", self.countries[entity.text]["capital"])
token._.set("country_latlng", self.countries[entity.text]["latlng"])
token._.set("country_flag", self.countries[entity.text]["flag"])
# Iterate over all spans and merge them into one token
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
# Overwrite doc.ents and add entity; be careful not to replace!
doc.ents = list(doc.ents) + spans
return doc # don't forget to return the Doc!
def has_country(self, tokens):
"""Getter for Doc and Span attributes. Since the getter is only called
when we access the attribute, we can refer to the Token's 'is_country'
def has_country(self, doc):
"""Getter for Doc attributes. Since the getter is only called
when we access the attribute, we can refer to the Span's 'is_country'
attribute here, which is already set in the processing step."""
return any([t._.get("is_country") for t in tokens])
return any([entity._.get("is_country") for entity in doc.ents])
nlp = English()
nlp.add_pipe("rest_countries", config={"label": "GPE"})
doc = nlp("Some text about Colombia and the Czech Republic")
print("Pipeline", nlp.pipe_names) # pipeline contains component name
print("Doc has countries", doc._.has_country) # Doc contains countries
for token in doc:
if token._.is_country:
print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
print("Entities", [(e.text, e.label_) for e in doc.ents])
for ent in doc.ents:
if ent._.is_country:
print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
```
In this case, all data can be fetched on initialization in one request. However,