Revert added_strings change (#6236)

Ines Montani 2020-10-10 18:55:07 +02:00 committed by GitHub
parent 796f8b9424
commit bfa3931c9d
23 changed files with 110 additions and 94 deletions

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a37"
+__version__ = "3.0.0a38"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -456,6 +456,14 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
     # TODO: fix numbering after merging develop into master
+    E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
+            "is not set or None. If you've implemented a custom component, make "
+            "sure to store the component model as `self.model` in your "
+            "component's __init__ method.")
+    E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute "
+            "is not set or None. If you've implemented a custom component, make "
+            "sure to store the current `nlp` object's vocab as `self.vocab` in "
+            "your component's __init__ method.")
     E900 = ("Could not run the full pipeline for evaluation. If you specified "
             "frozen components, make sure they were already initialized and "
             "trained. Full pipeline: {pipeline}")

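The two new errors spell out the contract that the rest of this commit enforces: a custom trainable component must assign both self.vocab and self.model in its __init__, otherwise serialization fails with E899/E898. A minimal sketch of a component that satisfies the contract — the component class and its Linear model are illustrative placeholders, not part of this commit:

from spacy.pipeline import TrainablePipe
from spacy.vocab import Vocab
from thinc.api import Linear


class MyComponent(TrainablePipe):
    # Skeleton only: a real component would also implement predict,
    # set_annotations, get_loss etc.
    def __init__(self, vocab, model, name="my_component"):
        self.vocab = vocab   # missing/None vocab raises E899 on (de)serialization
        self.model = model   # missing/None model raises E898 on (de)serialization
        self.name = name
        self.cfg = {}


# Round-trips cleanly because both attributes are set:
pipe = MyComponent(Vocab(), Linear())
data = pipe.to_bytes()
MyComponent(Vocab(), Linear()).from_bytes(data)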
View File

@@ -30,7 +30,6 @@ cdef class KnowledgeBase:
     cdef Pool mem
     cpdef readonly Vocab vocab
     cdef int64_t entity_vector_length
-    cdef public set _added_strings
     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _KBEntryC struct in the _entries vector).

View File

@@ -92,7 +92,6 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.vocab = vocab
         self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
-        self._added_strings = set()
     @property
     def entity_vector_length(self):
@@ -114,16 +113,12 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]
-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
     def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
        Return the hash of the entity ID/name at the end.
         """
-        cdef hash_t entity_hash = self.add_string(entity)
+        cdef hash_t entity_hash = self.vocab.strings.add(entity)
         # Return if this entity was added before
         if entity_hash in self._entry_index:
@@ -157,7 +152,7 @@ cdef class KnowledgeBase:
         cdef hash_t entity_hash
         while i < len(entity_list):
             # only process this entity if its unique ID hadn't been added before
-            entity_hash = self.add_string(entity_list[i])
+            entity_hash = self.vocab.strings.add(entity_list[i])
             if entity_hash in self._entry_index:
                 warnings.warn(Warnings.W018.format(entity=entity_list[i]))
@@ -203,7 +198,7 @@ cdef class KnowledgeBase:
         if prob_sum > 1.00001:
             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
-        cdef hash_t alias_hash = self.add_string(alias)
+        cdef hash_t alias_hash = self.vocab.strings.add(alias)
         # Check whether this alias was added before
         if alias_hash in self._alias_index:
@@ -332,7 +327,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E928.format(loc=path))
         serialize = {}
         serialize["contents"] = lambda p: self.write_contents(p)
-        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
         util.to_disk(path, serialize, exclude)
     def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
@@ -343,7 +338,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E928.format(loc=path))
         deserialize = {}
         deserialize["contents"] = lambda p: self.read_contents(p)
-        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
+        deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
         util.from_disk(path, deserialize, exclude)
     def write_contents(self, file_path):

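With the _added_strings bookkeeping gone, the KB simply serializes the vocab's full StringStore next to its contents, so entity and alias strings survive a disk round trip without any extra tracking. A rough sketch of the behaviour this restores, using the same entities and alias as the entity linker tests further down (the path is illustrative):

from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

kb = KnowledgeBase(Vocab(), entity_vector_length=1)
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
kb.to_disk("/tmp/kb")  # writes contents plus strings.json via vocab.strings.to_disk

# Loading into a KB backed by a fresh Vocab restores the strings as well
kb2 = KnowledgeBase(Vocab(), entity_vector_length=1)
kb2.from_disk("/tmp/kb")
assert "douglas" in kb2.vocab.strings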
View File

@@ -1,4 +1,4 @@
-from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator
+from typing import List, Dict, Union, Iterable, Any, Optional, Callable
 from typing import Tuple
 import srsly
 from pathlib import Path
@@ -57,7 +57,6 @@ class AttributeRuler(Pipe):
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
-        self._added_strings = set()
     def clear(self) -> None:
         """Reset all patterns."""
@@ -187,16 +186,12 @@ class AttributeRuler(Pipe):
         # We need to make a string here, because otherwise the ID we pass back
         # will be interpreted as the hash of a string, rather than an ordinal.
         key = str(len(self.attrs))
-        self.matcher.add(self.add_string(key), patterns)
+        self.matcher.add(self.vocab.strings.add(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)
         self.indices.append(index)
-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
     def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
         """Add patterns from a list of pattern dicts with the keys as the
         arguments to AttributeRuler.add.
@@ -256,8 +251,8 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
         """
         serialize = {}
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
-        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
         return util.to_bytes(serialize, exclude)
     def from_bytes(
@@ -276,7 +271,7 @@ class AttributeRuler(Pipe):
             self.add_patterns(srsly.msgpack_loads(b))
         deserialize = {
-            "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
+            "vocab": lambda b: self.vocab.from_bytes(b),
             "patterns": load_patterns,
         }
         util.from_bytes(bytes_data, deserialize, exclude)
@@ -293,7 +288,7 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
         """
         serialize = {
-            "strings.json": lambda p: srsly.write_json(p, self._added_strings),
+            "vocab": lambda p: self.vocab.to_disk(p),
             "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
         }
         util.to_disk(path, serialize, exclude)
@@ -314,7 +309,7 @@ class AttributeRuler(Pipe):
             self.add_patterns(srsly.read_msgpack(p))
         deserialize = {
-            "strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)],
+            "vocab": lambda p: self.vocab.from_disk(p),
             "patterns": load_patterns,
         }
         util.from_disk(path, deserialize, exclude)

View File

@@ -453,6 +453,7 @@ class EntityLinker(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/entitylinker#to_disk
         """
         serialize = {}
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
         serialize["kb"] = lambda p: self.kb.to_disk(p)
         serialize["model"] = lambda p: self.model.to_disk(p)
@@ -481,8 +482,6 @@ class EntityLinker(TrainablePipe):
         deserialize["kb"] = lambda p: self.kb.from_disk(p)
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
-        for s in self.kb._added_strings:
-            self.vocab.strings.add(s)
         return self
     def rehearse(self, examples, *, sgd=None, losses=None, **config):

View File

@@ -281,6 +281,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)
@@ -296,6 +297,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
+        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
         self._validate_tables()
@@ -310,6 +312,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)
@@ -325,6 +328,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
         self._validate_tables()

View File

@@ -95,7 +95,6 @@ class Morphologizer(Tagger):
         # add mappings for empty morph
         self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
         self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
-        self._added_strings = set()
     @property
     def labels(self):
@@ -129,7 +128,6 @@ class Morphologizer(Tagger):
         label_dict.pop(self.POS_FEAT)
         # normalize morph string and add to morphology table
         norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
-        self.add_string(norm_morph)
         # add label mappings
         if norm_label not in self.cfg["labels_morph"]:
             self.cfg["labels_morph"][norm_label] = norm_morph
@@ -161,7 +159,6 @@ class Morphologizer(Tagger):
             if pos:
                 morph_dict[self.POS_FEAT] = pos
             norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
-            self.add_string(norm_label)
             # add label->morph and label->POS mappings
             if norm_label not in self.cfg["labels_morph"]:
                 self.cfg["labels_morph"][norm_label] = morph
@@ -179,7 +176,6 @@ class Morphologizer(Tagger):
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
                 norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
-                self.add_string(norm_label)
                 gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
             doc_sample.append(example.x)
             label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
@@ -238,7 +234,6 @@ class Morphologizer(Tagger):
             if pos:
                 label_dict[self.POS_FEAT] = pos
             label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
-            self.add_string(label)
             eg_truths.append(label)
         truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)

View File

@@ -61,7 +61,6 @@ class SentenceRecognizer(Tagger):
         self.name = name
         self._rehearsal_model = None
         self.cfg = {}
-        self._added_strings = set()
     @property
     def labels(self):

View File

@@ -78,7 +78,6 @@ class Tagger(TrainablePipe):
         self._rehearsal_model = None
         cfg = {"labels": labels or []}
         self.cfg = dict(sorted(cfg.items()))
-        self._added_strings = set()
     @property
     def labels(self):
@@ -313,7 +312,7 @@ class Tagger(TrainablePipe):
             return 0
         self._allow_extra_label()
         self.cfg["labels"].append(label)
-        self.add_string(label)
+        self.vocab.strings.add(label)
         return 1
     def score(self, examples, **kwargs):

View File

@@ -110,7 +110,6 @@ class TextCategorizer(TrainablePipe):
         self._rehearsal_model = None
         cfg = {"labels": [], "threshold": threshold, "positive_label": None}
         self.cfg = dict(cfg)
-        self._added_strings = set()
     @property
     def labels(self) -> Tuple[str]:
@@ -301,7 +300,7 @@ class TextCategorizer(TrainablePipe):
             return 0
         self._allow_extra_label()
         self.cfg["labels"].append(label)
-        self.add_string(label)
+        self.vocab.strings.add(label)
         return 1
     def initialize(
def initialize( def initialize(

View File

@@ -64,7 +64,6 @@ class Tok2Vec(TrainablePipe):
         self.name = name
         self.listeners = []
         self.cfg = {}
-        self._added_strings = set()
     def add_listener(self, listener: "Tok2VecListener") -> None:
         """Add a listener for a downstream component. Usually internals."""

View File

@@ -5,4 +5,3 @@ cdef class TrainablePipe(Pipe):
     cdef public Vocab vocab
     cdef public object model
     cdef public object cfg
-    cdef public set _added_strings

View File

@@ -13,6 +13,7 @@ from ..vocab import Vocab
 from ..language import Language
 from ..training import Example
 cdef class TrainablePipe(Pipe):
     """This class is a base class and not instantiated directly. Trainable
     pipeline components like the EntityRecognizer or TextCategorizer inherit
@@ -35,7 +36,6 @@ cdef class TrainablePipe(Pipe):
         self.model = model
         self.name = name
         self.cfg = dict(cfg)
-        self._added_strings = set()
     def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
@@ -198,10 +198,6 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
     @property
     def is_trainable(self) -> bool:
         return True
@@ -244,6 +240,16 @@ cdef class TrainablePipe(Pipe):
         """
         self.model.finish_update(sgd)
+    def _validate_serialization_attrs(self):
+        """Check that the pipe implements the required attributes. If a subclass
+        implements a custom __init__ method but doesn't set these attributes,
+        they currently default to None, so we need to perform additional checks.
+        """
+        if not hasattr(self, "vocab") or self.vocab is None:
+            raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
+        if not hasattr(self, "model") or self.model is None:
+            raise ValueError(Errors.E898.format(name=util.get_object_name(self)))
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the pipe to a bytestring.
@@ -252,11 +258,12 @@ cdef class TrainablePipe(Pipe):
         DOCS: https://nightly.spacy.io/api/pipe#to_bytes
         """
+        self._validate_serialization_attrs()
         serialize = {}
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["model"] = self.model.to_bytes
-        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
         return util.to_bytes(serialize, exclude)
     def from_bytes(self, bytes_data, *, exclude=tuple()):
@@ -267,6 +274,7 @@ cdef class TrainablePipe(Pipe):
         DOCS: https://nightly.spacy.io/api/pipe#from_bytes
         """
+        self._validate_serialization_attrs()
         def load_model(b):
             try:
@@ -275,9 +283,9 @@ cdef class TrainablePipe(Pipe):
                 raise ValueError(Errors.E149) from None
         deserialize = {}
-        deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)]
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["model"] = load_model
         util.from_bytes(bytes_data, deserialize, exclude)
         return self
@@ -290,10 +298,11 @@ cdef class TrainablePipe(Pipe):
         DOCS: https://nightly.spacy.io/api/pipe#to_disk
         """
+        self._validate_serialization_attrs()
         serialize = {}
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["model"] = lambda p: self.model.to_disk(p)
         util.to_disk(path, serialize, exclude)
@@ -306,6 +315,7 @@ cdef class TrainablePipe(Pipe):
         DOCS: https://nightly.spacy.io/api/pipe#from_disk
         """
+        self._validate_serialization_attrs()
         def load_model(p):
             try:
@@ -314,9 +324,9 @@ cdef class TrainablePipe(Pipe):
                 raise ValueError(Errors.E149) from None
         deserialize = {}
-        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
+        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
         return self

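Taken together, a trainable pipe now serializes its "cfg", the full "vocab" and the "model", and the new _validate_serialization_attrs() guard turns a missing vocab or model into a clear E899/E898 instead of an opaque failure. Custom strings added via vocab.strings therefore round-trip with the component again, without the strings.json bookkeeping. A sketch with the stock tagger (assuming the nightly v3 API shown in this diff):

from spacy.lang.en import English

nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("VBZ")        # goes straight into nlp.vocab.strings now
data = tagger.to_bytes()       # serializes "cfg", "vocab" and "model"

nlp2 = English()
tagger2 = nlp2.add_pipe("tagger")
tagger2.from_bytes(data)
assert "VBZ" in tagger2.vocab.strings   # restored via the serialized vocab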
View File

@@ -76,7 +76,6 @@ cdef class Parser(TrainablePipe):
             self.add_multitask_objective(multitask)
         self._rehearsal_model = None
-        self._added_strings = set()
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -120,7 +119,7 @@ cdef class Parser(TrainablePipe):
                 resized = True
         if resized:
             self._resize()
-            self.add_string(label)
+            self.vocab.strings.add(label)
             return 1
         return 0
@@ -456,24 +455,24 @@ cdef class Parser(TrainablePipe):
     def to_disk(self, path, exclude=tuple()):
         serializers = {
-            'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
-            'strings.json': lambda p: srsly.write_json(p, self._added_strings),
-            'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
-            'cfg': lambda p: srsly.write_json(p, self.cfg)
+            "model": lambda p: (self.model.to_disk(p) if self.model is not True else True),
+            "vocab": lambda p: self.vocab.to_disk(p),
+            "moves": lambda p: self.moves.to_disk(p, exclude=["strings"]),
+            "cfg": lambda p: srsly.write_json(p, self.cfg)
         }
         util.to_disk(path, serializers, exclude)
     def from_disk(self, path, exclude=tuple()):
         deserializers = {
-            'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)],
-            'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
-            'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
-            'model': lambda p: None,
+            "vocab": lambda p: self.vocab.from_disk(p),
+            "moves": lambda p: self.moves.from_disk(p, exclude=["strings"]),
+            "cfg": lambda p: self.cfg.update(srsly.read_json(p)),
+            "model": lambda p: None,
         }
         util.from_disk(path, deserializers, exclude)
-        if 'model' not in exclude:
+        if "model" not in exclude:
             path = util.ensure_path(path)
-            with (path / 'model').open('rb') as file_:
+            with (path / "model").open("rb") as file_:
                 bytes_data = file_.read()
             try:
                 self._resize()
@@ -485,7 +484,7 @@ cdef class Parser(TrainablePipe):
     def to_bytes(self, exclude=tuple()):
         serializers = {
             "model": lambda: (self.model.to_bytes()),
-            "strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)),
+            "vocab": lambda: self.vocab.to_bytes(),
             "moves": lambda: self.moves.to_bytes(exclude=["strings"]),
             "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
         }
@@ -493,7 +492,7 @@ cdef class Parser(TrainablePipe):
     def from_bytes(self, bytes_data, exclude=tuple()):
         deserializers = {
-            "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
+            "vocab": lambda b: self.vocab.from_bytes(b),
             "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
             "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
             "model": lambda b: None,

View File

@@ -121,9 +121,7 @@ def test_kb_default(nlp):
 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
-    entity_linker = nlp.add_pipe(
-        "entity_linker", config={"entity_vector_length": 35}
-    )
+    entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 35})
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
@@ -213,16 +211,11 @@ def test_el_pipe_configuration(nlp):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
         kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
         kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
-        kb.add_alias(
-            alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]
-        )
+        kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
         return kb
     # run an EL pipe without a trained context encoder, to check the candidate generation step only
-    entity_linker = nlp.add_pipe(
-        "entity_linker",
-        config={"incl_context": False},
-    )
+    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False},)
     entity_linker.set_kb(create_kb)
     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
@@ -453,14 +446,10 @@ def test_overfitting_IO():
         return mykb
     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe(
-        "entity_linker",
-        last=True,
-    )
+    entity_linker = nlp.add_pipe("entity_linker", last=True,)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings
     assert "Q2146908" in entity_linker.kb.vocab.strings
-    assert "Q2146908" in entity_linker.kb._added_strings
     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)

View File

@@ -101,4 +101,3 @@ def test_overfitting_IO():
     doc2 = nlp2(test_text)
     assert [str(t.morph) for t in doc2] == gold_morphs
     assert [t.pos_ for t in doc2] == gold_pos_tags
-    assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings

View File

@@ -80,4 +80,3 @@ def test_overfitting_IO():
     nlp2 = util.load_model_from_path(tmp_dir)
     doc2 = nlp2(test_text)
     assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
-    assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings

View File

@@ -98,7 +98,6 @@ def test_overfitting_IO():
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses["tagger"] < 0.00001
-    assert tagger._added_strings == {"J", "N", "V"}
     # test the trained model
     test_text = "I like blue eggs"
@@ -117,7 +116,6 @@ def test_overfitting_IO():
     assert doc2[1].tag_ is "V"
     assert doc2[2].tag_ is "J"
     assert doc2[3].tag_ is "N"
-    assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"}
 def test_tagger_requires_labels():

View File

@@ -146,7 +146,6 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2
-    assert textcat._added_strings == {"NEGATIVE", "POSITIVE"}
     for i in range(50):
         losses = {}
@@ -168,7 +167,6 @@ def test_overfitting_IO():
     cats2 = doc2.cats
     assert cats2["POSITIVE"] > 0.9
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
-    assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"}
     # Test scoring
     scores = nlp.evaluate(train_examples)

View File

@@ -7,6 +7,7 @@ from spacy.kb import KnowledgeBase, Writer
 from spacy.vectors import Vectors
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.vocab import Vocab
 from ..util import make_tempdir
@@ -50,8 +51,9 @@ def custom_pipe():
             else:
                 self.cfg = None
             self.model = SerializableDummy()
+            self.vocab = vocab
-    return MyPipe(None)
+    return MyPipe(Vocab())
 def tagger():

View File

@@ -1,13 +1,13 @@
 import pytest
-import srsly
 from spacy import registry, Vocab
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import TextCategorizer, SentenceRecognizer
+from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
 from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.lang.en import English
+from thinc.api import Linear
 import spacy
 from ..util import make_tempdir
@@ -89,7 +89,6 @@ def test_serialize_parser_strings(Parser):
     assert label not in vocab2.strings
     parser2 = Parser(vocab2, model, **config)
     parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
-    assert parser1._added_strings == parser2._added_strings == {"FunnyLabel"}
     assert label in parser2.vocab.strings
@@ -166,17 +165,13 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
         # check that custom labels are serialized as part of the component's strings.jsonl
         tagger.add_label(label)
         assert label in tagger.vocab.strings
-        assert tagger._added_strings == {label}
         file_path = d / "tagger1"
         tagger.to_disk(file_path)
-        strings = srsly.read_json(file_path / "strings.json")
-        assert strings == ["SomeWeirdLabel"]
         # ensure that the custom strings are loaded back in when using the tagger in another pipeline
         cfg = {"model": DEFAULT_TAGGER_MODEL}
         model = registry.resolve(cfg, validate=True)["model"]
         tagger2 = Tagger(de_vocab, model).from_disk(file_path)
         assert label in tagger2.vocab.strings
-        assert tagger2._added_strings == {label}
 def test_serialize_textcat_empty(en_vocab):
@@ -253,3 +248,40 @@ def test_serialize_pipeline_disable_enable():
     assert nlp5.pipe_names == ["ner"]
     assert nlp5.component_names == ["ner"]
     assert nlp5.disabled == []
+def test_serialize_custom_trainable_pipe():
+    class BadCustomPipe1(TrainablePipe):
+        def __init__(self, vocab):
+            pass
+    class BadCustomPipe2(TrainablePipe):
+        def __init__(self, vocab):
+            self.vocab = vocab
+            self.model = None
+    class CustomPipe(TrainablePipe):
+        def __init__(self, vocab, model):
+            self.vocab = vocab
+            self.model = model
+    pipe = BadCustomPipe1(Vocab())
+    with pytest.raises(ValueError):
+        pipe.to_bytes()
+    with make_tempdir() as d:
+        with pytest.raises(ValueError):
+            pipe.to_disk(d)
+    pipe = BadCustomPipe2(Vocab())
+    with pytest.raises(ValueError):
+        pipe.to_bytes()
+    with make_tempdir() as d:
+        with pytest.raises(ValueError):
+            pipe.to_disk(d)
+    pipe = CustomPipe(Vocab(), Linear())
+    pipe_bytes = pipe.to_bytes()
+    new_pipe = CustomPipe(Vocab(), Linear()).from_bytes(pipe_bytes)
+    assert new_pipe.to_bytes() == pipe_bytes
+    with make_tempdir() as d:
+        pipe.to_disk(d)
+        new_pipe = CustomPipe(Vocab(), Linear()).from_disk(d)
+        assert new_pipe.to_bytes() == pipe_bytes

View File

@@ -821,7 +821,7 @@ def get_object_name(obj: Any) -> str:
     obj (Any): The Python object, typically a function or class.
     RETURNS (str): A human-readable name.
     """
-    if hasattr(obj, "name"):
+    if hasattr(obj, "name") and obj.name is not None:
         return obj.name
     if hasattr(obj, "__name__"):
         return obj.__name__
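The extra obj.name is not None check matters for the new validation errors: a half-initialized component (like the BadCustomPipe classes in the new test) can leave name unset or None, and the E898/E899 message should then report a class name rather than None. An illustrative snippet — the fallback to the class name happens in the part of get_object_name not shown in this hunk:

from spacy import util

class Broken:
    name = None  # e.g. a custom component whose __init__ never set its name

# Previously this returned None; with the extra check it falls through
# to the later fallbacks and reports the class name instead.
print(util.get_object_name(Broken()))  # "Broken"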