TrainablePipe (#6213)

* rename Pipe to TrainablePipe
* split functionality between Pipe and TrainablePipe
* remove unnecessary methods from certain components
* cleanup
* hasattr(component, "pipe") should be sufficient again
* remove serialization and vocab/cfg from Pipe
* unify _ensure_examples and validate_examples
* small fixes
* hasattr checks for self.cfg and self.vocab
* make is_resizable and is_trainable properties
* serialize strings.json instead of vocab
* fix KB IO + tests
* fix typos
* more typos
* _added_strings as a set
* few more tests specifically for _added_strings field
* bump to 3.0.0a36

Parent: 5ebd1fc2cf
Commit: d093d6343b
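For orientation, here is a condensed sketch of how responsibilities are split after this commit. The Pipe side is assembled from the pipe.pyx hunks below; the new TrainablePipe lives in spacy/pipeline/trainable_pipe.pyx, which is not part of this diff excerpt, so its body here is an assumption based on the commit message and the members removed from Pipe.

    # Sketch only; abbreviated from the diff below, not the literal source.
    class Pipe:
        # Non-trainable base class: a minimal interface for components.
        def __call__(self, doc):
            # Must be implemented by each subclass (raises E931 otherwise).
            raise NotImplementedError

        def pipe(self, stream, *, batch_size=128):
            # Default streaming now just falls back to __call__.
            for doc in stream:
                yield self(doc)

        def initialize(self, get_examples, *, nlp=None):
            pass  # optional for non-trainable components

        @property
        def is_trainable(self):  # a property now, not a method
            return False

    class TrainablePipe(Pipe):
        # Assumed: carries vocab/model/cfg (removed from Pipe in pipe.pxd)
        # plus predict/set_annotations/update/get_loss and serialization.
        def __init__(self, vocab, model, name, **cfg):
            self.vocab = vocab
            self.model = model
            self.name = name
            self.cfg = dict(cfg)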
setup.py (1 change)
@@ -37,6 +37,7 @@ MOD_NAMES = [
     "spacy.pipeline.multitask",
     "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
+    "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
     "spacy.pipeline.senter",
     "spacy.pipeline.tagger",
spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a35"
+__version__ = "3.0.0a36"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
spacy/errors.py
@@ -522,14 +522,12 @@ class Errors:
    E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
            "but the provided argument {loc} points to a file.")
    E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
-   E930 = ("Received invalid get_examples callback in `{name}.initialize`. "
+   E930 = ("Received invalid get_examples callback in `{method}`. "
            "Expected function that returns an iterable of Example objects but "
            "got: {obj}")
-   E931 = ("Encountered Pipe subclass without `Pipe.{method}` method in component "
-           "'{name}'. If the component is trainable and you want to use this "
-           "method, make sure it's overwritten on the subclass. If your "
-           "component isn't trainable, add a method that does nothing or "
-           "don't use the Pipe base class.")
+   E931 = ("Encountered {parent} subclass without `{parent}.{method}` "
+           "method in component '{name}'. If you want to use this "
+           "method, make sure it's overwritten on the subclass.")
    E940 = ("Found NaN values in scores.")
    E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
            "model from a shortcut, which is deprecated as of spaCy v3.0. To "
spacy/kb.pxd
@@ -30,6 +30,7 @@ cdef class KnowledgeBase:
    cdef Pool mem
    cpdef readonly Vocab vocab
    cdef int64_t entity_vector_length
+   cdef public set _added_strings

    # This maps 64bit keys (hash of unique entity string)
    # to 64bit values (position of the _KBEntryC struct in the _entries vector).
spacy/kb.pyx (49 changes)
@@ -1,5 +1,7 @@
 # cython: infer_types=True, profile=True
-from typing import Iterator
+from typing import Iterator, Iterable
 
+import srsly
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from cpython.exc cimport PyErr_SetFromErrno
@@ -10,13 +12,10 @@ from libcpp.vector cimport vector
 from pathlib import Path
 import warnings
 
-from spacy.strings import StringStore
-
-from spacy import util
-
 from .typedefs cimport hash_t
 from .errors import Errors, Warnings
+from . import util
+from .util import SimpleFrozenList, ensure_path
 
 cdef class Candidate:
     """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
@@ -85,9 +84,6 @@ cdef class KnowledgeBase:
     DOCS: https://nightly.spacy.io/api/kb
     """
 
-    contents_loc = "contents"
-    strings_loc = "strings.json"
-
     def __init__(self, Vocab vocab, entity_vector_length):
         """Create a KnowledgeBase."""
         self.mem = Pool()
@@ -95,8 +91,8 @@ cdef class KnowledgeBase:
         self._entry_index = PreshMap()
         self._alias_index = PreshMap()
         self.vocab = vocab
-        self.vocab.strings.add("")
         self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
+        self._added_strings = set()
 
     @property
     def entity_vector_length(self):
@@ -118,12 +114,16 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]
 
+    def add_string(self, string: str):
+        self._added_strings.add(string)
+        return self.vocab.strings.add(string)
+
     def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.
         """
-        cdef hash_t entity_hash = self.vocab.strings.add(entity)
+        cdef hash_t entity_hash = self.add_string(entity)
 
         # Return if this entity was added before
         if entity_hash in self._entry_index:
@@ -157,7 +157,7 @@
         cdef hash_t entity_hash
         while i < len(entity_list):
             # only process this entity if its unique ID hadn't been added before
-            entity_hash = self.vocab.strings.add(entity_list[i])
+            entity_hash = self.add_string(entity_list[i])
             if entity_hash in self._entry_index:
                 warnings.warn(Warnings.W018.format(entity=entity_list[i]))
 
@@ -203,7 +203,7 @@
         if prob_sum > 1.00001:
             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
 
-        cdef hash_t alias_hash = self.vocab.strings.add(alias)
+        cdef hash_t alias_hash = self.add_string(alias)
 
         # Check whether this alias was added before
         if alias_hash in self._alias_index:
@@ -324,26 +324,27 @@
 
         return 0.0
 
-    def to_disk(self, path):
-        path = util.ensure_path(path)
+    def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+        path = ensure_path(path)
         if not path.exists():
             path.mkdir(parents=True)
         if not path.is_dir():
             raise ValueError(Errors.E928.format(loc=path))
-        self.write_contents(path / self.contents_loc)
-        self.vocab.strings.to_disk(path / self.strings_loc)
+        serialize = {}
+        serialize["contents"] = lambda p: self.write_contents(p)
+        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        util.to_disk(path, serialize, exclude)
 
-    def from_disk(self, path):
-        path = util.ensure_path(path)
+    def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+        path = ensure_path(path)
         if not path.exists():
             raise ValueError(Errors.E929.format(loc=path))
         if not path.is_dir():
             raise ValueError(Errors.E928.format(loc=path))
-        self.read_contents(path / self.contents_loc)
-        kb_strings = StringStore()
-        kb_strings.from_disk(path / self.strings_loc)
-        for string in kb_strings:
-            self.vocab.strings.add(string)
+        deserialize = {}
+        deserialize["contents"] = lambda p: self.read_contents(p)
+        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
+        util.from_disk(path, deserialize, exclude)
 
     def write_contents(self, file_path):
         cdef Writer writer = Writer(file_path)
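The effect of the kb.pyx changes: a KnowledgeBase now writes its binary "contents" plus a small "strings.json" holding only the strings it added itself, instead of dumping the whole shared vocab. A minimal usage sketch (entity and alias values are illustrative):

    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    kb = KnowledgeBase(Vocab(), entity_vector_length=3)
    kb.add_entity("Q1004791", freq=6, entity_vector=[0, 3, 5])
    kb.add_alias("Boston", ["Q1004791"], [0.8])
    kb.to_disk("/tmp/kb")  # writes "contents" and "strings.json"

    kb2 = KnowledgeBase(Vocab(), entity_vector_length=3)
    kb2.from_disk("/tmp/kb")  # re-adds the stored strings via add_string()
    assert "Boston" in kb2.get_alias_strings()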
spacy/language.py
@@ -20,7 +20,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
 from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
-from .util import registry, SimpleFrozenList
+from .util import registry, SimpleFrozenList, _pipe
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@@ -1095,7 +1095,7 @@ class Language:
            if (
                name not in exclude
                and hasattr(proc, "is_trainable")
-               and proc.is_trainable()
+               and proc.is_trainable
                and proc.model not in (True, False, None)
            ):
                proc.finish_update(sgd)
@@ -1194,8 +1194,8 @@ class Language:
            doc = Doc(self.vocab, words=["x", "y", "z"])
            get_examples = lambda: [Example.from_dict(doc, {})]
        if not hasattr(get_examples, "__call__"):
-           err = Errors.E930.format(name="Language", obj=type(get_examples))
-           raise ValueError(err)
+           err = Errors.E930.format(method="Language.initialize", obj=type(get_examples))
+           raise TypeError(err)
        # Make sure the config is interpolated so we can resolve subsections
        config = self.config.interpolate()
        # These are the settings provided in the [initialize] block in the config
@@ -1301,16 +1301,7 @@ class Language:
        for name, pipe in self.pipeline:
            kwargs = component_cfg.get(name, {})
            kwargs.setdefault("batch_size", batch_size)
-           # non-trainable components may have a pipe() implementation that refers to dummy
-           # predict and set_annotations methods
-           if (
-               not hasattr(pipe, "pipe")
-               or not hasattr(pipe, "is_trainable")
-               or not pipe.is_trainable()
-           ):
-               docs = _pipe(docs, pipe, kwargs)
-           else:
-               docs = pipe.pipe(docs, **kwargs)
+           docs = _pipe(docs, pipe, kwargs)
        # iterate over the final generator
        if len(self.pipeline):
            docs = list(docs)
@@ -1417,17 +1408,7 @@ class Language:
            kwargs = component_cfg.get(name, {})
            # Allow component_cfg to overwrite the top-level kwargs.
            kwargs.setdefault("batch_size", batch_size)
-           # non-trainable components may have a pipe() implementation that refers to dummy
-           # predict and set_annotations methods
-           if (
-               hasattr(proc, "pipe")
-               and hasattr(proc, "is_trainable")
-               and proc.is_trainable()
-           ):
-               f = functools.partial(proc.pipe, **kwargs)
-           else:
-               # Apply the function, but yield the doc
-               f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
+           f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
            pipes.append(f)
 
        if n_process != 1:
@@ -1826,19 +1807,6 @@ class DisabledPipes(list):
        self[:] = []
 
 
-def _pipe(
-    examples: Iterable[Example], proc: Callable[[Doc], Doc], kwargs: Dict[str, Any]
-) -> Iterator[Example]:
-    # We added some args for pipe that __call__ doesn't expect.
-    kwargs = dict(kwargs)
-    for arg in ["batch_size"]:
-        if arg in kwargs:
-            kwargs.pop(arg)
-    for eg in examples:
-        eg = proc(eg, **kwargs)
-        yield eg
-
-
 def _apply_pipes(
     make_doc: Callable[[str], Doc],
     pipes: Iterable[Callable[[Doc], Doc]],
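The duplicated branching above collapses into a single call to _pipe, now imported from spacy.util. That module is not part of this excerpt, so the following is a hypothetical sketch of the relocated helper, inferred from the deleted in-file _pipe and the commit note "hasattr(component, 'pipe') should be sufficient again":

    def _pipe(docs, proc, kwargs):
        # Assumed dispatch: components with a real pipe() get the
        # batch-aware path, everything else falls back to __call__.
        if hasattr(proc, "pipe"):
            yield from proc.pipe(docs, **kwargs)
        else:
            # Drop args that __call__ doesn't expect.
            kwargs = dict(kwargs)
            kwargs.pop("batch_size", None)
            for doc in docs:
                yield proc(doc, **kwargs)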
spacy/pipeline/__init__.py
@@ -6,6 +6,7 @@ from .entityruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .pipe import Pipe
+from .trainable_pipe import TrainablePipe
 from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
 from .tagger import Tagger
@@ -21,6 +22,7 @@ __all__ = [
    "EntityRuler",
    "Morphologizer",
    "Lemmatizer",
+   "TrainablePipe",
    "Pipe",
    "SentenceRecognizer",
    "Sentencizer",
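After this change both base classes are part of the public pipeline API:

    from spacy.pipeline import Pipe, TrainablePipe

    # Per the new Pipe docstring below, TrainablePipe is the subclass that
    # trainable components such as Tagger or EntityLinker inherit from.
    assert issubclass(TrainablePipe, Pipe)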
spacy/pipeline/attributeruler.py
@@ -57,6 +57,7 @@ class AttributeRuler(Pipe):
        self.attrs = []
        self._attrs_unnormed = []  # store for reference
        self.indices = []
+       self._added_strings = set()
 
    def clear(self) -> None:
        """Reset all patterns."""
@@ -123,21 +124,6 @@
                set_token_attrs(span[index], attrs)
        return doc
 
-   def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
-       """Apply the pipe to a stream of documents. This usually happens under
-       the hood when the nlp object is called on a text and all components are
-       applied to the Doc.
-
-       stream (Iterable[Doc]): A stream of documents.
-       batch_size (int): The number of documents to buffer.
-       YIELDS (Doc): Processed documents in order.
-
-       DOCS: https://spacy.io/attributeruler/pipe#pipe
-       """
-       for doc in stream:
-           doc = self(doc)
-           yield doc
-
    def load_from_tag_map(
        self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
    ) -> None:
@@ -201,12 +187,16 @@
        # We need to make a string here, because otherwise the ID we pass back
        # will be interpreted as the hash of a string, rather than an ordinal.
        key = str(len(self.attrs))
-       self.matcher.add(self.vocab.strings.add(key), patterns)
+       self.matcher.add(self.add_string(key), patterns)
        self._attrs_unnormed.append(attrs)
        attrs = normalize_token_attrs(self.vocab, attrs)
        self.attrs.append(attrs)
        self.indices.append(index)
 
+   def add_string(self, string: str):
+       self._added_strings.add(string)
+       return self.vocab.strings.add(string)
+
    def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
        """Add patterns from a list of pattern dicts with the keys as the
        arguments to AttributeRuler.add.
@@ -266,8 +256,8 @@
        DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
        """
        serialize = {}
-       serialize["vocab"] = self.vocab.to_bytes
        serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
+       serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
        return util.to_bytes(serialize, exclude)
 
    def from_bytes(
@@ -286,7 +276,7 @@
            self.add_patterns(srsly.msgpack_loads(b))
 
        deserialize = {
-           "vocab": lambda b: self.vocab.from_bytes(b),
+           "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
            "patterns": load_patterns,
        }
        util.from_bytes(bytes_data, deserialize, exclude)
@@ -303,7 +293,7 @@
        DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
        """
        serialize = {
-           "vocab": lambda p: self.vocab.to_disk(p),
+           "strings.json": lambda p: srsly.write_json(p, self._added_strings),
            "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
        }
        util.to_disk(path, serialize, exclude)
@@ -324,7 +314,7 @@
            self.add_patterns(srsly.read_msgpack(p))
 
        deserialize = {
-           "vocab": lambda p: self.vocab.from_disk(p),
+           "strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)],
            "patterns": load_patterns,
        }
        util.from_disk(path, deserialize, exclude)
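The same bookkeeping pattern as in the KB: the component records every string it adds in _added_strings and round-trips only that set as "strings.json". An illustrative round trip (pattern values are made up):

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.add(patterns=[[{"ORTH": "the"}]], attrs={"LEMMA": "the"})

    data = ruler.to_bytes()  # contains "patterns" and "strings.json"

    nlp2 = spacy.blank("en")
    ruler2 = nlp2.add_pipe("attribute_ruler")
    ruler2.from_bytes(data)  # re-adds the stored strings via add_string()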
spacy/pipeline/entity_linker.py
@@ -10,10 +10,11 @@ import warnings
 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
 from ..tokens import Doc
-from .pipe import Pipe, deserialize_config
+from .pipe import deserialize_config
+from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
-from ..training import Example, validate_examples
+from ..training import Example, validate_examples, validate_get_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util
@@ -90,7 +91,7 @@ def make_entity_linker(
    )
 
 
-class EntityLinker(Pipe):
+class EntityLinker(TrainablePipe):
    """Pipeline component for named entity linking.
 
    DOCS: https://nightly.spacy.io/api/entitylinker
@@ -172,7 +173,7 @@
 
        DOCS: https://nightly.spacy.io/api/entitylinker#initialize
        """
-       self._ensure_examples(get_examples)
+       validate_get_examples(get_examples, "EntityLinker.initialize")
        if kb_loader is not None:
            self.set_kb(kb_loader)
        self.validate_kb()
@@ -453,7 +454,6 @@
        """
        serialize = {}
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-       serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        serialize["kb"] = lambda p: self.kb.to_disk(p)
        serialize["model"] = lambda p: self.model.to_disk(p)
        util.to_disk(path, serialize, exclude)
@@ -477,11 +477,12 @@
            raise ValueError(Errors.E149) from None
 
        deserialize = {}
-       deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
        deserialize["kb"] = lambda p: self.kb.from_disk(p)
        deserialize["model"] = load_model
        util.from_disk(path, deserialize, exclude)
+       for s in self.kb._added_strings:
+           self.vocab.strings.add(s)
        return self
 
    def rehearse(self, examples, *, sgd=None, losses=None, **config):
spacy/pipeline/entityruler.py
@@ -342,12 +342,6 @@ class EntityRuler(Pipe):
        validate_examples(examples, "EntityRuler.score")
        return Scorer.score_spans(examples, "ents", **kwargs)
 
-   def predict(self, docs):
-       pass
-
-   def set_annotations(self, docs, scores):
-       pass
-
    def from_bytes(
        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityRuler":
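With the split, non-trainable components like EntityRuler no longer need the dummy predict/set_annotations stubs deleted here (the same cleanup happens in Sentencizer below): only __call__ is part of the base Pipe contract. A minimal sketch of a custom non-trainable component under the new contract; the component name and logic are made up for illustration:

    from spacy.language import Language
    from spacy.pipeline import Pipe
    from spacy.tokens import Doc

    class MarkedComponent(Pipe):
        """Only __call__ is required; no dummy training methods."""

        def __init__(self, name: str = "marked_component"):
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            # Illustrative: modify the doc in place and return it.
            return doc

    @Language.factory("marked_component")
    def make_marked_component(nlp: Language, name: str):
        return MarkedComponent(name)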
spacy/pipeline/lemmatizer.py
@@ -281,7 +281,6 @@ class Lemmatizer(Pipe):
        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
        """
        serialize = {}
-       serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        serialize["lookups"] = lambda p: self.lookups.to_disk(p)
        util.to_disk(path, serialize, exclude)
 
@@ -297,7 +296,6 @@
        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
        """
        deserialize = {}
-       deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
        deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
        util.from_disk(path, deserialize, exclude)
        self._validate_tables()
@@ -312,7 +310,6 @@
        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
        """
        serialize = {}
-       serialize["vocab"] = self.vocab.to_bytes
        serialize["lookups"] = self.lookups.to_bytes
        return util.to_bytes(serialize, exclude)
 
@@ -328,7 +325,6 @@
        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
        """
        deserialize = {}
-       deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
        deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
        util.from_bytes(bytes_data, deserialize, exclude)
        self._validate_tables()
spacy/pipeline/morphologizer.pyx
@@ -16,7 +16,7 @@ from .pipe import deserialize_config
 from .tagger import Tagger
 from .. import util
 from ..scorer import Scorer
-from ..training import validate_examples
+from ..training import validate_examples, validate_get_examples
 
 
 default_model_config = """
@@ -95,6 +95,7 @@ class Morphologizer(Tagger):
        # add mappings for empty morph
        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
+       self._added_strings = set()
 
    @property
    def labels(self):
@@ -128,6 +129,7 @@
        label_dict.pop(self.POS_FEAT)
        # normalize morph string and add to morphology table
        norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+       self.add_string(norm_morph)
        # add label mappings
        if norm_label not in self.cfg["labels_morph"]:
            self.cfg["labels_morph"][norm_label] = norm_morph
@@ -144,7 +146,7 @@
 
        DOCS: https://nightly.spacy.io/api/morphologizer#initialize
        """
-       self._ensure_examples(get_examples)
+       validate_get_examples(get_examples, "Morphologizer.initialize")
        if labels is not None:
            self.cfg["labels_morph"] = labels["morph"]
            self.cfg["labels_pos"] = labels["pos"]
@@ -159,6 +161,7 @@
                if pos:
                    morph_dict[self.POS_FEAT] = pos
                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+               self.add_string(norm_label)
                # add label->morph and label->POS mappings
                if norm_label not in self.cfg["labels_morph"]:
                    self.cfg["labels_morph"][norm_label] = morph
@@ -176,6 +179,7 @@
                if pos:
                    morph_dict[self.POS_FEAT] = pos
                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+               self.add_string(norm_label)
                gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
            doc_sample.append(example.x)
            label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
@@ -234,6 +238,7 @@
            if pos:
                label_dict[self.POS_FEAT] = pos
            label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+           self.add_string(label)
            eg_truths.append(label)
        truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
spacy/pipeline/multitask.pyx
@@ -6,7 +6,7 @@ from thinc.api import set_dropout_rate
 
 from ..tokens.doc cimport Doc
 
-from .pipe import Pipe
+from .trainable_pipe import TrainablePipe
 from .tagger import Tagger
 from ..training import validate_examples
 from ..language import Language
@@ -164,7 +164,7 @@ class MultitaskObjective(Tagger):
        return "I-SENT"
 
 
-class ClozeMultitask(Pipe):
+class ClozeMultitask(TrainablePipe):
    def __init__(self, vocab, model, **cfg):
        self.vocab = vocab
        self.model = model
spacy/pipeline/pipe.pxd
@@ -1,5 +1,2 @@
 cdef class Pipe:
-    cdef public object vocab
-    cdef public object model
     cdef public str name
-    cdef public object cfg
spacy/pipeline/pipe.pyx
@@ -1,38 +1,22 @@
 # cython: infer_types=True, profile=True
 import warnings
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Iterable, Iterator, Callable, Union, Dict
 import srsly
-from thinc.api import set_dropout_rate, Model
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples
+from ..training import Example
 from ..errors import Errors, Warnings
-from .. import util
+from ..language import Language
 
 
 cdef class Pipe:
-    """This class is a base class and not instantiated directly. Trainable
-    pipeline components like the EntityRecognizer or TextCategorizer inherit
-    from it and it defines the interface that components should follow to
-    function as trainable components in a spaCy pipeline.
+    """This class is a base class and not instantiated directly. It provides
+    an interface for pipeline components to implement.
+    Trainable pipeline components like the EntityRecognizer or TextCategorizer
+    should inherit from the subclass 'TrainablePipe'.
 
     DOCS: https://nightly.spacy.io/api/pipe
     """
-    def __init__(self, vocab, model, name, **cfg):
-        """Initialize a pipeline component.
-
-        vocab (Vocab): The shared vocabulary.
-        model (thinc.api.Model): The Thinc Model powering the pipeline component.
-        name (str): The component instance name.
-        **cfg: Additonal settings and config parameters.
-
-        DOCS: https://nightly.spacy.io/api/pipe#init
-        """
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        self.cfg = dict(cfg)
-
     @classmethod
     def __init_subclass__(cls, **kwargs):
@@ -41,18 +25,7 @@ cdef class Pipe:
         if hasattr(cls, "begin_training"):
             warnings.warn(Warnings.W088.format(name=cls.__name__))
 
-    @property
-    def labels(self) -> Optional[Tuple[str]]:
-        return []
-
-    @property
-    def label_data(self):
-        """Optional JSON-serializable data that would be sufficient to recreate
-        the label set if provided to the `pipe.initialize()` method.
-        """
-        return None
-
-    def __call__(self, Doc doc):
+    def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
         and returned. This usually happens under the hood when the nlp object
         is called on a text and all components are applied to the Doc.
@@ -62,11 +35,9 @@ cdef class Pipe:
 
         DOCS: https://nightly.spacy.io/api/pipe#call
         """
-        scores = self.predict([doc])
-        self.set_annotations([doc], scores)
-        return doc
+        raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
 
-    def pipe(self, stream, *, batch_size=128):
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -77,137 +48,17 @@ cdef class Pipe:
 
         DOCS: https://nightly.spacy.io/api/pipe#pipe
         """
-        for docs in util.minibatch(stream, size=batch_size):
-            scores = self.predict(docs)
-            self.set_annotations(docs, scores)
-            yield from docs
+        for doc in stream:
+            doc = self(doc)
+            yield doc
 
-    def predict(self, docs):
-        """Apply the pipeline's model to a batch of docs, without modifying them.
-        Returns a single tensor for a batch of documents.
-
-        docs (Iterable[Doc]): The documents to predict.
-        RETURNS: Vector representations for each token in the documents.
-
-        DOCS: https://nightly.spacy.io/api/pipe#predict
-        """
-        raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))
-
-    def set_annotations(self, docs, scores):
-        """Modify a batch of documents, using pre-computed scores.
-
-        docs (Iterable[Doc]): The documents to modify.
-        scores: The scores to assign.
-
-        DOCS: https://nightly.spacy.io/api/pipe#set_annotations
-        """
-        raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
-
-    def update(self, examples, *, drop=0.0, set_annotations=False, sgd=None, losses=None):
-        """Learn from a batch of documents and gold-standard information,
-        updating the pipe's model. Delegates to predict and get_loss.
-
-        examples (Iterable[Example]): A batch of Example objects.
-        drop (float): The dropout rate.
-        set_annotations (bool): Whether or not to update the Example objects
-            with the predictions.
-        sgd (thinc.api.Optimizer): The optimizer.
-        losses (Dict[str, float]): Optional record of the loss during training.
-            Updated using the component name as the key.
-        RETURNS (Dict[str, float]): The updated losses dictionary.
-
-        DOCS: https://nightly.spacy.io/api/pipe#update
-        """
-        if losses is None:
-            losses = {}
-        if not hasattr(self, "model") or self.model in (None, True, False):
-            return losses
-        losses.setdefault(self.name, 0.0)
-        validate_examples(examples, "Pipe.update")
-        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
-            # Handle cases where there are no tokens in any docs.
-            return
-        set_dropout_rate(self.model, drop)
-        scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
-        loss, d_scores = self.get_loss(examples, scores)
-        bp_scores(d_scores)
-        if sgd not in (None, False):
-            self.finish_update(sgd)
-        losses[self.name] += loss
-        if set_annotations:
-            docs = [eg.predicted for eg in examples]
-            self.set_annotations(docs, scores=scores)
-        return losses
-
-    def rehearse(self, examples, *, sgd=None, losses=None, **config):
-        """Perform a "rehearsal" update from a batch of data. Rehearsal updates
-        teach the current model to make predictions similar to an initial model,
-        to try to address the "catastrophic forgetting" problem. This feature is
-        experimental.
-
-        examples (Iterable[Example]): A batch of Example objects.
-        drop (float): The dropout rate.
-        sgd (thinc.api.Optimizer): The optimizer.
-        losses (Dict[str, float]): Optional record of the loss during training.
-            Updated using the component name as the key.
-        RETURNS (Dict[str, float]): The updated losses dictionary.
-
-        DOCS: https://nightly.spacy.io/api/pipe#rehearse
-        """
-        pass
-
-    def get_loss(self, examples, scores):
-        """Find the loss and gradient of loss for the batch of documents and
-        their predicted scores.
-
-        examples (Iterable[Examples]): The batch of examples.
-        scores: Scores representing the model's predictions.
-        RETURNS (Tuple[float, float]): The loss and the gradient.
-
-        DOCS: https://nightly.spacy.io/api/pipe#get_loss
-        """
-        raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))
-
-    def add_label(self, label):
-        """Add an output label, to be predicted by the model. It's possible to
-        extend pretrained models with new labels, but care should be taken to
-        avoid the "catastrophic forgetting" problem.
-
-        label (str): The label to add.
-        RETURNS (int): 0 if label is already present, otherwise 1.
-
-        DOCS: https://nightly.spacy.io/api/pipe#add_label
-        """
-        raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
-
-    def _require_labels(self) -> None:
-        """Raise an error if the component's model has no labels defined."""
-        if not self.labels or list(self.labels) == [""]:
-            raise ValueError(Errors.E143.format(name=self.name))
-
-    def _allow_extra_label(self) -> None:
-        """Raise an error if the component can not add any more labels."""
-        if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels):
-            if not self.is_resizable():
-                raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
-
-    def create_optimizer(self):
-        """Create an optimizer for the pipeline component.
-
-        RETURNS (thinc.api.Optimizer): The optimizer.
-
-        DOCS: https://nightly.spacy.io/api/pipe#create_optimizer
-        """
-        return util.create_default_optimizer()
-
-    def initialize(self, get_examples, *, nlp=None):
-        """Initialize the pipe for training, using data examples if available.
-        This method needs to be implemented by each Pipe component,
-        ensuring the internal model (if available) is initialized properly
-        using the provided sample of Example objects.
+    def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
+        """Initialize the pipe. For non-trainable components, this method
+        is optional. For trainable components, which should inherit
+        from the subclass TrainablePipe, the provided data examples
+        should be used to ensure that the internal model is initialized
+        properly and all input/output dimensions throughout the network are
+        inferred.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
@@ -217,49 +68,7 @@ cdef class Pipe:
         """
         pass
 
-    def _ensure_examples(self, get_examples):
-        if get_examples is None or not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name=self.name, obj=type(get_examples))
-            raise ValueError(err)
-        if not get_examples():
-            err = Errors.E930.format(name=self.name, obj=get_examples())
-            raise ValueError(err)
-
-    def is_resizable(self):
-        return hasattr(self, "model") and "resize_output" in self.model.attrs
-
-    def is_trainable(self):
-        return hasattr(self, "model") and isinstance(self.model, Model)
-
-    def set_output(self, nO):
-        if self.is_resizable():
-            self.model.attrs["resize_output"](self.model, nO)
-        else:
-            raise NotImplementedError(Errors.E921)
-
-    def use_params(self, params):
-        """Modify the pipe's model, to use the given parameter values. At the
-        end of the context, the original parameters are restored.
-
-        params (dict): The parameter values to use in the model.
-
-        DOCS: https://nightly.spacy.io/api/pipe#use_params
-        """
-        with self.model.use_params(params):
-            yield
-
-    def finish_update(self, sgd):
-        """Update parameters using the current parameter gradients.
-        The Optimizer instance contains the functionality to perform
-        the stochastic gradient descent.
-
-        sgd (thinc.api.Optimizer): The optimizer.
-
-        DOCS: https://nightly.spacy.io/api/pipe#finish_update
-        """
-        self.model.finish_update(sgd)
-
-    def score(self, examples, **kwargs):
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Union[float, Dict[str, float]]]:
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
@@ -269,81 +78,25 @@ cdef class Pipe:
         """
         return {}
 
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-
-        DOCS: https://nightly.spacy.io/api/pipe#to_bytes
-        """
-        serialize = {}
-        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        serialize["model"] = self.model.to_bytes
-        if hasattr(self, "vocab"):
-            serialize["vocab"] = self.vocab.to_bytes
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Pipe): The loaded object.
-
-        DOCS: https://nightly.spacy.io/api/pipe#from_bytes
-        """
-
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {}
-        if hasattr(self, "vocab"):
-            deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
-        deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
-        deserialize["model"] = load_model
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, *, exclude=tuple()):
-        """Serialize the pipe to disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://nightly.spacy.io/api/pipe#to_disk
-        """
-        serialize = {}
-        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
-        serialize["model"] = lambda p: self.model.to_disk(p)
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, *, exclude=tuple()):
-        """Load the pipe from disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Pipe): The loaded object.
-
-        DOCS: https://nightly.spacy.io/api/pipe#from_disk
-        """
-
-        def load_model(p):
-            try:
-                self.model.from_bytes(p.open("rb").read())
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {}
-        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
-        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
-        deserialize["model"] = load_model
-        util.from_disk(path, deserialize, exclude)
-        return self
+    @property
+    def is_trainable(self) -> bool:
+        return False
+
+    @property
+    def labels(self) -> Optional[Tuple[str]]:
+        return tuple()
+
+    @property
+    def label_data(self):
+        """Optional JSON-serializable data that would be sufficient to recreate
+        the label set if provided to the `pipe.initialize()` method.
+        """
+        return None
+
+    def _require_labels(self) -> None:
+        """Raise an error if this component has no labels defined."""
+        if not self.labels or list(self.labels) == [""]:
+            raise ValueError(Errors.E143.format(name=self.name))
 
 def deserialize_config(path):
     if path.exists():
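Because is_trainable is now a property that defaults to False on the base class, call sites like Language.update above drop the parentheses. A small illustration of the resulting pattern (it assumes TrainablePipe subclasses report True, which this excerpt doesn't show):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")  # non-trainable: is_trainable is False
    nlp.add_pipe("tagger")       # trainable: subclasses TrainablePipe

    trainable = [name for name, proc in nlp.pipeline
                 if hasattr(proc, "is_trainable") and proc.is_trainable]
    assert trainable == ["tagger"]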
spacy/pipeline/sentencizer.pyx
@@ -58,9 +58,6 @@ class Sentencizer(Pipe):
        else:
            self.punct_chars = set(self.default_punct_chars)
 
-   def initialize(self, get_examples, nlp=None):
-       pass
-
    def __call__(self, doc):
        """Apply the sentencizer to a Doc and set Token.is_sent_start.
 
@@ -204,9 +201,3 @@
        cfg = srsly.read_json(path)
        self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
        return self
-
-   def get_loss(self, examples, scores):
-       raise NotImplementedError
-
-   def add_label(self, label):
-       raise NotImplementedError
spacy/pipeline/senter.pyx
@@ -6,12 +6,11 @@ from thinc.api import Model, SequenceCategoricalCrossentropy, Config
 
 from ..tokens.doc cimport Doc
 
-from .pipe import deserialize_config
 from .tagger import Tagger
 from ..language import Language
 from ..errors import Errors
 from ..scorer import Scorer
-from ..training import validate_examples
+from ..training import validate_examples, validate_get_examples
 from .. import util
 
 
@@ -62,6 +61,7 @@ class SentenceRecognizer(Tagger):
        self.name = name
        self._rehearsal_model = None
        self.cfg = {}
+       self._added_strings = set()
 
    @property
    def labels(self):
@@ -138,7 +138,7 @@ class SentenceRecognizer(Tagger):
 
        DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
        """
-       self._ensure_examples(get_examples)
+       validate_get_examples(get_examples, "SentenceRecognizer.initialize")
        doc_sample = []
        label_sample = []
        assert self.labels, Errors.E924.format(name=self.name)
spacy/pipeline/tagger.pyx
@@ -11,13 +11,14 @@ from ..tokens.doc cimport Doc
 from ..morphology cimport Morphology
 from ..vocab cimport Vocab
 
-from .pipe import Pipe, deserialize_config
+from .trainable_pipe import TrainablePipe
+from .pipe import deserialize_config
 from ..language import Language
 from ..attrs import POS, ID
 from ..parts_of_speech import X
 from ..errors import Errors, Warnings
 from ..scorer import Scorer
-from ..training import validate_examples
+from ..training import validate_examples, validate_get_examples
 from .. import util
 
 
@@ -55,7 +56,7 @@ def make_tagger(nlp: Language, name: str, model: Model):
    return Tagger(nlp.vocab, model, name)
 
 
-class Tagger(Pipe):
+class Tagger(TrainablePipe):
    """Pipeline component for part-of-speech tagging.
 
    DOCS: https://nightly.spacy.io/api/tagger
@@ -77,6 +78,7 @@ class Tagger(Pipe):
        self._rehearsal_model = None
        cfg = {"labels": labels or []}
        self.cfg = dict(sorted(cfg.items()))
+       self._added_strings = set()
 
    @property
    def labels(self):
@@ -274,7 +276,7 @@ class Tagger(Pipe):
 
        DOCS: https://nightly.spacy.io/api/tagger#initialize
        """
-       self._ensure_examples(get_examples)
+       validate_get_examples(get_examples, "Tagger.initialize")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
@@ -311,7 +313,7 @@
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
-       self.vocab.strings.add(label)
+       self.add_string(label)
        return 1
 
    def score(self, examples, **kwargs):
@@ -325,79 +327,3 @@
        """
        validate_examples(examples, "Tagger.score")
        return Scorer.score_token_attr(examples, "tag", **kwargs)
-
-   def to_bytes(self, *, exclude=tuple()):
-       """Serialize the pipe to a bytestring.
-
-       exclude (Iterable[str]): String names of serialization fields to exclude.
-       RETURNS (bytes): The serialized object.
-
-       DOCS: https://nightly.spacy.io/api/tagger#to_bytes
-       """
-       serialize = {}
-       serialize["model"] = self.model.to_bytes
-       serialize["vocab"] = self.vocab.to_bytes
-       serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-       return util.to_bytes(serialize, exclude)
-
-   def from_bytes(self, bytes_data, *, exclude=tuple()):
-       """Load the pipe from a bytestring.
-
-       bytes_data (bytes): The serialized pipe.
-       exclude (Iterable[str]): String names of serialization fields to exclude.
-       RETURNS (Tagger): The loaded Tagger.
-
-       DOCS: https://nightly.spacy.io/api/tagger#from_bytes
-       """
-       def load_model(b):
-           try:
-               self.model.from_bytes(b)
-           except AttributeError:
-               raise ValueError(Errors.E149) from None
-
-       deserialize = {
-           "vocab": lambda b: self.vocab.from_bytes(b),
-           "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
-           "model": lambda b: load_model(b),
-       }
-       util.from_bytes(bytes_data, deserialize, exclude)
-       return self
-
-   def to_disk(self, path, *, exclude=tuple()):
-       """Serialize the pipe to disk.
-
-       path (str / Path): Path to a directory.
-       exclude (Iterable[str]): String names of serialization fields to exclude.
-
-       DOCS: https://nightly.spacy.io/api/tagger#to_disk
-       """
-       serialize = {
-           "vocab": lambda p: self.vocab.to_disk(p),
-           "model": lambda p: self.model.to_disk(p),
-           "cfg": lambda p: srsly.write_json(p, self.cfg),
-       }
-       util.to_disk(path, serialize, exclude)
-
-   def from_disk(self, path, *, exclude=tuple()):
-       """Load the pipe from disk. Modifies the object in place and returns it.
-
-       path (str / Path): Path to a directory.
-       exclude (Iterable[str]): String names of serialization fields to exclude.
-       RETURNS (Tagger): The modified Tagger object.
-
-       DOCS: https://nightly.spacy.io/api/tagger#from_disk
-       """
-       def load_model(p):
-           with p.open("rb") as file_:
-               try:
-                   self.model.from_bytes(file_.read())
-               except AttributeError:
-                   raise ValueError(Errors.E149) from None
-
-       deserialize = {
-           "vocab": lambda p: self.vocab.from_disk(p),
-           "cfg": lambda p: self.cfg.update(deserialize_config(p)),
-           "model": load_model,
-       }
-       util.from_disk(path, deserialize, exclude)
-       return self
@@ -4,9 +4,9 @@ from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
 from thinc.types import Floats2d
 import numpy
 
-from .pipe import Pipe
+from .trainable_pipe import TrainablePipe
 from ..language import Language
-from ..training import Example, validate_examples
+from ..training import Example, validate_examples, validate_get_examples
 from ..errors import Errors
 from ..scorer import Scorer
 from .. import util
@@ -85,7 +85,7 @@ def make_textcat(
     return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
 
 
-class TextCategorizer(Pipe):
+class TextCategorizer(TrainablePipe):
     """Pipeline component for text classification.
 
     DOCS: https://nightly.spacy.io/api/textcategorizer
@@ -110,6 +110,7 @@ class TextCategorizer(Pipe):
         self._rehearsal_model = None
        cfg = {"labels": [], "threshold": threshold, "positive_label": None}
        self.cfg = dict(cfg)
+        self._added_strings = set()
 
     @property
     def labels(self) -> Tuple[str]:
@@ -119,13 +120,6 @@ class TextCategorizer(Pipe):
         """
         return tuple(self.cfg["labels"])
 
-    @labels.setter
-    def labels(self, value: List[str]) -> None:
-        # TODO: This really shouldn't be here. I had a look and I added it when
-        # I added the labels property, but it's pretty nasty to have this, and
-        # will lead to problems.
-        self.cfg["labels"] = tuple(value)
-
     @property
     def label_data(self) -> List[str]:
         """RETURNS (List[str]): Information about the component's labels."""
@@ -306,7 +300,8 @@ class TextCategorizer(Pipe):
         if label in self.labels:
             return 0
         self._allow_extra_label()
-        self.labels = tuple(list(self.labels) + [label])
+        self.cfg["labels"].append(label)
+        self.add_string(label)
         return 1
 
     def initialize(
@@ -329,7 +324,7 @@ class TextCategorizer(Pipe):
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "TextCategorizer.initialize")
         if labels is None:
             for example in get_examples():
                 for cat in example.y.cats:
@@ -2,8 +2,8 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
 from itertools import islice
 
-from .pipe import Pipe
-from ..training import Example, validate_examples
+from .trainable_pipe import TrainablePipe
+from ..training import Example, validate_examples, validate_get_examples
 from ..tokens import Doc
 from ..vocab import Vocab
 from ..language import Language
@@ -32,7 +32,7 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
     return Tok2Vec(nlp.vocab, model, name)
 
 
-class Tok2Vec(Pipe):
+class Tok2Vec(TrainablePipe):
     """Apply a "token-to-vector" model and set its outputs in the doc.tensor
     attribute. This is mostly useful to share a single subnetwork between multiple
     components, e.g. to have one embedding and CNN network shared between a
@@ -64,6 +64,7 @@ class Tok2Vec(Pipe):
         self.name = name
         self.listeners = []
         self.cfg = {}
+        self._added_strings = set()
 
     def add_listener(self, listener: "Tok2VecListener") -> None:
         """Add a listener for a downstream component. Usually internals."""
@@ -218,7 +219,7 @@ class Tok2Vec(Pipe):
 
         DOCS: https://nightly.spacy.io/api/tok2vec#initialize
         """
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "Tok2Vec.initialize")
         doc_sample = []
         for example in islice(get_examples(), 10):
             doc_sample.append(example.x)
spacy/pipeline/trainable_pipe.pxd (new file, +8)
@@ -0,0 +1,8 @@
+from .pipe cimport Pipe
+from ..vocab cimport Vocab
+
+cdef class TrainablePipe(Pipe):
+    cdef public Vocab vocab
+    cdef public object model
+    cdef public object cfg
+    cdef public set _added_strings
spacy/pipeline/trainable_pipe.pyx (new file, +322)
@@ -0,0 +1,322 @@
+# cython: infer_types=True, profile=True
+from typing import Iterable, Iterator, Optional, Dict, Tuple, Callable
+import srsly
+from thinc.api import set_dropout_rate, Model, Optimizer
+
+from ..tokens.doc cimport Doc
+
+from ..training import validate_examples
+from ..errors import Errors
+from .pipe import Pipe, deserialize_config
+from .. import util
+from ..vocab import Vocab
+from ..language import Language
+from ..training import Example
+
+
+cdef class TrainablePipe(Pipe):
+    """This class is a base class and not instantiated directly. Trainable
+    pipeline components like the EntityRecognizer or TextCategorizer inherit
+    from it and it defines the interface that components should follow to
+    function as trainable components in a spaCy pipeline.
+
+    DOCS: https://nightly.spacy.io/api/pipe
+    """
+    def __init__(self, vocab: Vocab, model: Model, name: str, **cfg):
+        """Initialize a pipeline component.
+
+        vocab (Vocab): The shared vocabulary.
+        model (thinc.api.Model): The Thinc Model powering the pipeline component.
+        name (str): The component instance name.
+        **cfg: Additional settings and config parameters.
+
+        DOCS: https://nightly.spacy.io/api/pipe#init
+        """
+        self.vocab = vocab
+        self.model = model
+        self.name = name
+        self.cfg = dict(cfg)
+        self._added_strings = set()
+
+    def __call__(self, Doc doc) -> Doc:
+        """Apply the pipe to one document. The document is modified in place,
+        and returned. This usually happens under the hood when the nlp object
+        is called on a text and all components are applied to the Doc.
+
+        doc (Doc): The Doc to process.
+        RETURNS (Doc): The processed Doc.
+
+        DOCS: https://nightly.spacy.io/api/pipe#call
+        """
+        scores = self.predict([doc])
+        self.set_annotations([doc], scores)
+        return doc
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
+        """Apply the pipe to a stream of documents. This usually happens under
+        the hood when the nlp object is called on a text and all components are
+        applied to the Doc.
+
+        stream (Iterable[Doc]): A stream of documents.
+        batch_size (int): The number of documents to buffer.
+        YIELDS (Doc): Processed documents in order.
+
+        DOCS: https://nightly.spacy.io/api/pipe#pipe
+        """
+        for docs in util.minibatch(stream, size=batch_size):
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
+            yield from docs
+
+    def predict(self, docs: Iterable[Doc]):
+        """Apply the pipeline's model to a batch of docs, without modifying them.
+        Returns a single tensor for a batch of documents.
+
+        docs (Iterable[Doc]): The documents to predict.
+        RETURNS: Vector representations of the predictions.
+
+        DOCS: https://nightly.spacy.io/api/pipe#predict
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="predict", name=self.name))
+
+    def set_annotations(self, docs: Iterable[Doc], scores):
+        """Modify a batch of documents, using pre-computed scores.
+
+        docs (Iterable[Doc]): The documents to modify.
+        scores: The scores to assign.
+
+        DOCS: https://nightly.spacy.io/api/pipe#set_annotations
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="set_annotations", name=self.name))
+
+    def update(self,
+               examples: Iterable["Example"],
+               *, drop: float=0.0,
+               set_annotations: bool=False,
+               sgd: Optimizer=None,
+               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+        """Learn from a batch of documents and gold-standard information,
+        updating the pipe's model. Delegates to predict and get_loss.
+
+        examples (Iterable[Example]): A batch of Example objects.
+        drop (float): The dropout rate.
+        set_annotations (bool): Whether or not to update the Example objects
+            with the predictions.
+        sgd (thinc.api.Optimizer): The optimizer.
+        losses (Dict[str, float]): Optional record of the loss during training.
+            Updated using the component name as the key.
+        RETURNS (Dict[str, float]): The updated losses dictionary.
+
+        DOCS: https://nightly.spacy.io/api/pipe#update
+        """
+        if losses is None:
+            losses = {}
+        if not hasattr(self, "model") or self.model in (None, True, False):
+            return losses
+        losses.setdefault(self.name, 0.0)
+        validate_examples(examples, "TrainablePipe.update")
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+            # Handle cases where there are no tokens in any docs.
+            return losses
+        set_dropout_rate(self.model, drop)
+        scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
+        loss, d_scores = self.get_loss(examples, scores)
+        bp_scores(d_scores)
+        if sgd not in (None, False):
+            self.finish_update(sgd)
+        losses[self.name] += loss
+        if set_annotations:
+            docs = [eg.predicted for eg in examples]
+            self.set_annotations(docs, scores=scores)
+        return losses
+
+    def rehearse(self,
+                 examples: Iterable[Example],
+                 *,
+                 sgd: Optimizer=None,
+                 losses: Dict[str, float]=None,
+                 **config) -> Dict[str, float]:
+        """Perform a "rehearsal" update from a batch of data. Rehearsal updates
+        teach the current model to make predictions similar to an initial model,
+        to try to address the "catastrophic forgetting" problem. This feature is
+        experimental.
+
+        examples (Iterable[Example]): A batch of Example objects.
+        sgd (thinc.api.Optimizer): The optimizer.
+        losses (Dict[str, float]): Optional record of the loss during training.
+            Updated using the component name as the key.
+        RETURNS (Dict[str, float]): The updated losses dictionary.
+
+        DOCS: https://nightly.spacy.io/api/pipe#rehearse
+        """
+        pass
+
+    def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
+        """Find the loss and gradient of loss for the batch of documents and
+        their predicted scores.
+
+        examples (Iterable[Example]): The batch of examples.
+        scores: Scores representing the model's predictions.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+
+        DOCS: https://nightly.spacy.io/api/pipe#get_loss
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
+
+    def create_optimizer(self) -> Optimizer:
+        """Create an optimizer for the pipeline component.
+
+        RETURNS (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#create_optimizer
+        """
+        return util.create_default_optimizer()
+
+    def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
+        """Initialize the pipe for training, using data examples if available.
+        This method needs to be implemented by each TrainablePipe component,
+        ensuring the internal model (if available) is initialized properly
+        using the provided sample of Example objects.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+
+        DOCS: https://nightly.spacy.io/api/pipe#initialize
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="initialize", name=self.name))
+
+    def add_label(self, label: str) -> int:
+        """Add an output label.
+        For TrainablePipe components, it is possible to
+        extend pretrained models with new labels, but care should be taken to
+        avoid the "catastrophic forgetting" problem.
+
+        label (str): The label to add.
+        RETURNS (int): 0 if label is already present, otherwise 1.
+
+        DOCS: https://nightly.spacy.io/api/pipe#add_label
+        """
+        raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
+
+    def add_string(self, string: str):
+        self._added_strings.add(string)
+        return self.vocab.strings.add(string)
+
+    @property
+    def is_trainable(self) -> bool:
+        return True
+
+    @property
+    def is_resizable(self) -> bool:
+        return getattr(self, "model", None) and "resize_output" in self.model.attrs
+
+    def _allow_extra_label(self) -> None:
+        """Raise an error if the component can not add any more labels."""
+        if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels):
+            if not self.is_resizable:
+                raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
+
+    def set_output(self, nO: int) -> None:
+        if self.is_resizable:
+            self.model.attrs["resize_output"](self.model, nO)
+        else:
+            raise NotImplementedError(Errors.E921)
+
+    def use_params(self, params: dict):
+        """Modify the pipe's model, to use the given parameter values. At the
+        end of the context, the original parameters are restored.
+
+        params (dict): The parameter values to use in the model.
+
+        DOCS: https://nightly.spacy.io/api/pipe#use_params
+        """
+        with self.model.use_params(params):
+            yield
+
+    def finish_update(self, sgd: Optimizer) -> None:
+        """Update parameters using the current parameter gradients.
+        The Optimizer instance contains the functionality to perform
+        the stochastic gradient descent.
+
+        sgd (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#finish_update
+        """
+        self.model.finish_update(sgd)
+
+    def to_bytes(self, *, exclude=tuple()):
+        """Serialize the pipe to a bytestring.
+
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.
+
+        DOCS: https://nightly.spacy.io/api/pipe#to_bytes
+        """
+        serialize = {}
+        if hasattr(self, "cfg"):
+            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        serialize["model"] = self.model.to_bytes
+        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data, *, exclude=tuple()):
+        """Load the pipe from a bytestring.
+
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (TrainablePipe): The loaded object.
+
+        DOCS: https://nightly.spacy.io/api/pipe#from_bytes
+        """
+
+        def load_model(b):
+            try:
+                self.model.from_bytes(b)
+            except AttributeError:
+                raise ValueError(Errors.E149) from None
+
+        deserialize = {}
+        deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)]
+        if hasattr(self, "cfg"):
+            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
+        deserialize["model"] = load_model
+        util.from_bytes(bytes_data, deserialize, exclude)
+        return self
+
+    def to_disk(self, path, *, exclude=tuple()):
+        """Serialize the pipe to disk.
+
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+
+        DOCS: https://nightly.spacy.io/api/pipe#to_disk
+        """
+        serialize = {}
+        if hasattr(self, "cfg"):
+            serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
+        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        serialize["model"] = lambda p: self.model.to_disk(p)
+        util.to_disk(path, serialize, exclude)
+
+    def from_disk(self, path, *, exclude=tuple()):
+        """Load the pipe from disk.
+
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (TrainablePipe): The loaded object.
+
+        DOCS: https://nightly.spacy.io/api/pipe#from_disk
+        """
+
+        def load_model(p):
+            try:
+                self.model.from_bytes(p.open("rb").read())
+            except AttributeError:
+                raise ValueError(Errors.E149) from None
+
+        deserialize = {}
+        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
+        if hasattr(self, "cfg"):
+            deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
+        deserialize["model"] = load_model
+        util.from_disk(path, deserialize, exclude)
+        return self
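To make the contract above concrete, here is a minimal sketch of a component built on `TrainablePipe`. It is illustrative only and not part of this commit: the component name (`LengthScorer`), the dummy loss, and the `user_data` key are all assumptions.

```python
# Minimal sketch of a TrainablePipe subclass (illustrative, not from this commit).
# predict() must not touch the docs; set_annotations() writes the scores back;
# initialize() reuses the shared validate_get_examples() helper added here.
from typing import Callable, Iterable
import numpy
from thinc.api import Model, Linear
from spacy.pipeline import TrainablePipe
from spacy.training import Example, validate_get_examples
from spacy.vocab import Vocab


class LengthScorer(TrainablePipe):
    """Toy component that scores each doc by its token count."""

    def __init__(self, vocab: Vocab, model: Model, name: str = "length_scorer"):
        self.vocab = vocab
        self.model = model
        self.name = name
        self.cfg = {}
        self._added_strings = set()

    def predict(self, docs):
        # Return scores without modifying any Doc, per the predict() contract.
        return numpy.asarray([[float(len(doc))] for doc in docs], dtype="f")

    def set_annotations(self, docs, scores):
        for doc, score in zip(docs, scores):
            doc.user_data[f"{self.name}_score"] = float(score[0])

    def get_loss(self, examples, scores):
        # Dummy squared loss against a zero target, just enough for update().
        d_scores = scores / max(len(scores), 1)
        return float((scores ** 2).sum()), d_scores

    def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp=None):
        validate_get_examples(get_examples, "LengthScorer.initialize")


# Usage sketch: __call__ from the base class wires up predict + set_annotations.
import spacy
nlp = spacy.blank("en")
comp = LengthScorer(nlp.vocab, Linear(1, 1))
doc = comp(nlp.make_doc("hello world"))
assert doc.user_data["length_scorer_score"] == 2.0
```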
@@ -1,13 +1,13 @@
 from cymem.cymem cimport Pool
 
 from ..vocab cimport Vocab
-from .pipe cimport Pipe
+from .trainable_pipe cimport TrainablePipe
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
 from ._parser_internals._state cimport StateC
 from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
 
 
-cdef class Parser(Pipe):
+cdef class Parser(TrainablePipe):
     cdef public object _rehearsal_model
     cdef readonly TransitionSystem moves
     cdef public object _multitasks
@@ -21,13 +21,14 @@ from ..ml.parser_model cimport predict_states, arg_max_if_valid
 from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
+from .trainable_pipe import TrainablePipe
 
-from ..training import validate_examples
+from ..training import validate_examples, validate_get_examples
 from ..errors import Errors, Warnings
 from .. import util
 
 
-cdef class Parser(Pipe):
+cdef class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -75,6 +76,7 @@ cdef class Parser(Pipe):
             self.add_multitask_objective(multitask)
 
         self._rehearsal_model = None
+        self._added_strings = set()
 
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -118,6 +120,7 @@ cdef class Parser(Pipe):
                 resized = True
             if resized:
                 self._resize()
+                self.add_string(label)
                 return 1
         return 0
 
@@ -411,7 +414,7 @@ cdef class Parser(Pipe):
         self.model.attrs["resize_output"](self.model, nO)
 
     def initialize(self, get_examples, nlp=None, labels=None):
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "Parser.initialize")
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
             langs = ", ".join(util.LEXEME_NORM_LANGS)
@@ -439,7 +442,7 @@ cdef class Parser(Pipe):
                     break
             # non-trainable components may have a pipe() implementation that refers to dummy
            # predict and set_annotations methods
-            if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
+            if hasattr(component, "pipe"):
                 doc_sample = list(component.pipe(doc_sample, batch_size=8))
             else:
                 doc_sample = [component(doc) for doc in doc_sample]
@@ -454,7 +457,7 @@ cdef class Parser(Pipe):
     def to_disk(self, path, exclude=tuple()):
         serializers = {
             'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
-            'vocab': lambda p: self.vocab.to_disk(p),
+            'strings.json': lambda p: srsly.write_json(p, self._added_strings),
             'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
             'cfg': lambda p: srsly.write_json(p, self.cfg)
         }
@@ -462,7 +465,7 @@ cdef class Parser(Pipe):
 
     def from_disk(self, path, exclude=tuple()):
         deserializers = {
-            'vocab': lambda p: self.vocab.from_disk(p),
+            'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)],
             'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
             'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
             'model': lambda p: None,
@@ -482,7 +485,7 @@ cdef class Parser(Pipe):
     def to_bytes(self, exclude=tuple()):
         serializers = {
             "model": lambda: (self.model.to_bytes()),
-            "vocab": lambda: self.vocab.to_bytes(),
+            "strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)),
             "moves": lambda: self.moves.to_bytes(exclude=["strings"]),
             "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
         }
@@ -490,7 +493,7 @@ cdef class Parser(Pipe):
 
     def from_bytes(self, bytes_data, exclude=tuple()):
         deserializers = {
-            "vocab": lambda b: self.vocab.from_bytes(b),
+            "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
             "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
             "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
             "model": lambda b: None,
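The net effect of these serialization hunks: the parser, like the other components above, stops bundling the entire shared vocab and instead ships only the strings it added itself as `strings.json`. A rough sketch of the resulting behaviour, mirroring the tests further down (the label name is made up):

```python
# Sketch: strings registered via add_string() travel with the component,
# so a fresh Vocab learns them on deserialization (illustrative label name).
import spacy

nlp1 = spacy.blank("en")
tagger1 = nlp1.add_pipe("tagger")
tagger1.add_label("FUNNY_TAG")              # calls add_string() internally
assert "FUNNY_TAG" in nlp1.vocab.strings

nlp2 = spacy.blank("en")
tagger2 = nlp2.add_pipe("tagger")
assert "FUNNY_TAG" not in nlp2.vocab.strings
tagger2.from_bytes(tagger1.to_bytes())      # strings.json restores the label
assert "FUNNY_TAG" in nlp2.vocab.strings
assert tagger2._added_strings == {"FUNNY_TAG"}
```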
@@ -368,7 +368,7 @@ class ConfigSchemaInit(BaseModel):
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
-    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
+    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
     # fmt: on
 
     class Config:
@@ -133,7 +133,7 @@ def test_kb_custom_length(nlp):
 def test_kb_initialize_empty(nlp):
     """Test that the EL can't initialize without examples"""
     entity_linker = nlp.add_pipe("entity_linker")
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         entity_linker.initialize(lambda: [])
 
 
@@ -153,6 +153,23 @@ def test_kb_serialize(nlp):
             mykb.from_disk(d / "unknown" / "kb")
 
 
+def test_kb_serialize_vocab(nlp):
+    """Test serialization of the KB and custom strings"""
+    entity = "MyFunnyID"
+    assert entity not in nlp.vocab.strings
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    assert not mykb.contains_entity(entity)
+    mykb.add_entity(entity, freq=342, entity_vector=[3])
+    assert mykb.contains_entity(entity)
+    assert entity in mykb.vocab.strings
+    with make_tempdir() as d:
+        # normal read-write behaviour
+        mykb.to_disk(d / "kb")
+        mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
+        mykb_new.from_disk(d / "kb")
+        assert entity in mykb_new.vocab.strings
+
+
 def test_candidate_generation(nlp):
     """Test correct candidate generation"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -413,6 +430,7 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
     vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
 
     # Convert the texts to docs to make sure we have doc.ents set for the training examples
     train_examples = []
@@ -440,6 +458,9 @@ def test_overfitting_IO():
         last=True,
     )
     entity_linker.set_kb(create_kb)
+    assert "Q2146908" in entity_linker.vocab.strings
+    assert "Q2146908" in entity_linker.kb.vocab.strings
+    assert "Q2146908" in entity_linker.kb._added_strings
 
     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -474,6 +495,10 @@ def test_overfitting_IO():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         assert nlp2.pipe_names == nlp.pipe_names
+        assert "Q2146908" in nlp2.vocab.strings
+        entity_linker2 = nlp2.get_pipe("entity_linker")
+        assert "Q2146908" in entity_linker2.vocab.strings
+        assert "Q2146908" in entity_linker2.kb.vocab.strings
         predictions = []
         for text, annotation in TRAIN_DATA:
             doc2 = nlp2(text)
@@ -66,9 +66,9 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: None)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=train_examples)
 
 
@@ -101,3 +101,4 @@ def test_overfitting_IO():
         doc2 = nlp2(test_text)
         assert [str(t.morph) for t in doc2] == gold_morphs
         assert [t.pos_ for t in doc2] == gold_pos_tags
+        assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings
@@ -1,6 +1,6 @@
 import pytest
 from spacy.language import Language
-from spacy.pipeline import Pipe
+from spacy.pipeline import TrainablePipe
 from spacy.util import SimpleFrozenList, get_arg_names
 
 
@@ -376,7 +376,7 @@ def test_pipe_label_data_no_labels(pipe):
 def test_warning_pipe_begin_training():
     with pytest.warns(UserWarning, match="begin_training"):
 
-        class IncompatPipe(Pipe):
+        class IncompatPipe(TrainablePipe):
             def __init__(self):
                 ...
 
@@ -40,9 +40,9 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: None)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=train_examples)
 
 
@@ -80,3 +80,4 @@ def test_overfitting_IO():
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
         assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
+        assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings
@@ -74,13 +74,13 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: None)
     with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: train_examples[0])
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: [])
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=train_examples)
 
 
@@ -98,6 +98,7 @@ def test_overfitting_IO():
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses["tagger"] < 0.00001
+    assert tagger._added_strings == {"J", "N", "V"}
 
     # test the trained model
     test_text = "I like blue eggs"
@@ -116,6 +117,7 @@ def test_overfitting_IO():
     assert doc2[1].tag_ is "V"
     assert doc2[2].tag_ is "J"
     assert doc2[3].tag_ is "N"
+    assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"}
 
 
 def test_tagger_requires_labels():
@@ -127,9 +127,9 @@ def test_initialize_examples():
     nlp.initialize()
     get_examples = make_get_examples(nlp)
     nlp.initialize(get_examples=get_examples)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: None)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=get_examples())
 
 
@@ -146,6 +146,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2
+    assert textcat._added_strings == {"NEGATIVE", "POSITIVE"}
 
     for i in range(50):
         losses = {}
@@ -167,6 +168,7 @@ def test_overfitting_IO():
     cats2 = doc2.cats
     assert cats2["POSITIVE"] > 0.9
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
+    assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"}
 
     # Test scoring
     scores = nlp.evaluate(train_examples)
@@ -1,5 +1,5 @@
 import pytest
-from spacy.pipeline import Pipe
+from spacy.pipeline import TrainablePipe
 from spacy.matcher import PhraseMatcher, Matcher
 from spacy.tokens import Doc, Span, DocBin
 from spacy.training import Example, Corpus
@@ -271,7 +271,7 @@ def test_issue4272():
 
 
 def test_multiple_predictions():
-    class DummyPipe(Pipe):
+    class DummyPipe(TrainablePipe):
         def __init__(self):
             self.model = "dummy_model"
 
@@ -1,4 +1,3 @@
-from typing import Callable
 import warnings
 from unittest import TestCase
 import pytest
@@ -7,8 +6,7 @@ from numpy import zeros
 from spacy.kb import KnowledgeBase, Writer
 from spacy.vectors import Vectors
 from spacy.language import Language
-from spacy.pipeline import Pipe
-from spacy.util import registry
+from spacy.pipeline import TrainablePipe
 
 from ..util import make_tempdir
 
@@ -45,14 +43,13 @@ def custom_pipe():
         def from_disk(self, path, exclude=tuple(), **kwargs):
             return self
 
-    class MyPipe(Pipe):
+    class MyPipe(TrainablePipe):
         def __init__(self, vocab, model=True, **cfg):
             if cfg:
                 self.cfg = cfg
             else:
                 self.cfg = None
             self.model = SerializableDummy()
-            self.vocab = SerializableDummy()
 
     return MyPipe(None)
 
@@ -1,5 +1,6 @@
 import pytest
-from spacy import registry
+import srsly
+from spacy import registry, Vocab
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
 from spacy.pipeline import TextCategorizer, SentenceRecognizer
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
@@ -69,6 +70,29 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     assert bytes_2 == bytes_3
 
 
+@pytest.mark.parametrize("Parser", test_parsers)
+def test_serialize_parser_strings(Parser):
+    vocab1 = Vocab()
+    label = "FunnyLabel"
+    assert label not in vocab1.strings
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 0,
+        "update_with_oracle_cut_size": 100,
+    }
+    cfg = {"model": DEFAULT_PARSER_MODEL}
+    model = registry.resolve(cfg, validate=True)["model"]
+    parser1 = Parser(vocab1, model, **config)
+    parser1.add_label(label)
+    assert label in parser1.vocab.strings
+    vocab2 = Vocab()
+    assert label not in vocab2.strings
+    parser2 = Parser(vocab2, model, **config)
+    parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
+    assert parser1._added_strings == parser2._added_strings == {"FunnyLabel"}
+    assert label in parser2.vocab.strings
+
+
 @pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
     config = {
@@ -132,6 +156,29 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
     assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
 
 
+def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
+    label = "SomeWeirdLabel"
+    assert label not in en_vocab.strings
+    assert label not in de_vocab.strings
+    tagger = taggers[0]
+    assert label not in tagger.vocab.strings
+    with make_tempdir() as d:
+        # check that custom labels are serialized as part of the component's strings.json
+        tagger.add_label(label)
+        assert label in tagger.vocab.strings
+        assert tagger._added_strings == {label}
+        file_path = d / "tagger1"
+        tagger.to_disk(file_path)
+        strings = srsly.read_json(file_path / "strings.json")
+        assert strings == ["SomeWeirdLabel"]
+        # ensure that the custom strings are loaded back in when using the tagger in another pipeline
+        cfg = {"model": DEFAULT_TAGGER_MODEL}
+        model = registry.resolve(cfg, validate=True)["model"]
+        tagger2 = Tagger(de_vocab, model).from_disk(file_path)
+        assert label in tagger2.vocab.strings
+        assert tagger2._added_strings == {label}
+
+
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
@@ -1,5 +1,5 @@
 from .corpus import Corpus  # noqa: F401
-from .example import Example, validate_examples  # noqa: F401
+from .example import Example, validate_examples, validate_get_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
@@ -44,6 +44,24 @@ def validate_examples(examples, method):
         raise TypeError(err)
 
 
+def validate_get_examples(get_examples, method):
+    """Check that a generator of a batch of examples received during processing is valid:
+    the callable produces a non-empty list of Example objects.
+    This function lives here to prevent circular imports.
+
+    get_examples (Callable[[], Iterable[Example]]): A function that produces a batch of examples.
+    method (str): The method name to show in error messages.
+    """
+    if get_examples is None or not hasattr(get_examples, "__call__"):
+        err = Errors.E930.format(method=method, obj=type(get_examples))
+        raise TypeError(err)
+    examples = get_examples()
+    if not examples:
+        err = Errors.E930.format(method=method, obj=examples)
+        raise TypeError(err)
+    validate_examples(examples, method)
+
+
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
         if predicted is None:
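As a quick illustration (mine, not part of the diff), the new validator rejects both a non-callable and a callback that produces no examples, which is exactly why the component tests above switch from `ValueError` to `TypeError`:

```python
# Sketch of the failure modes validate_get_examples guards against
# (component and training data are illustrative).
import pytest
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
train_examples = [
    Example.from_dict(nlp.make_doc("I like it"), {"tags": ["N", "V", "N"]})
]

with pytest.raises(TypeError):
    nlp.initialize(get_examples=train_examples)   # a list, not a callable
with pytest.raises(TypeError):
    nlp.initialize(get_examples=lambda: [])       # callable, but empty
nlp.initialize(get_examples=lambda: train_examples)  # valid
```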
@@ -21,7 +21,7 @@ def console_logger(progress_bar: bool = False):
         logged_pipes = [
             name
             for name, proc in nlp.pipeline
-            if hasattr(proc, "is_trainable") and proc.is_trainable()
+            if hasattr(proc, "is_trainable") and proc.is_trainable
         ]
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
@@ -188,7 +188,7 @@ def train_while_improving(
                 if (
                     name not in exclude
                     and hasattr(proc, "is_trainable")
-                    and proc.is_trainable()
+                    and proc.is_trainable
                     and proc.model not in (True, False, None)
                 ):
                     proc.finish_update(optimizer)
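Note that both call sites here drop the parentheses: `is_trainable` (like `is_resizable`) is now a property rather than a method. A small sketch of the new-style check (the component choices are illustrative):

```python
# Sketch: is_trainable is a property in this commit, so no call parentheses.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # rule-based component
nlp.add_pipe("tagger")       # trainable component

for name, proc in nlp.pipeline:
    if getattr(proc, "is_trainable", False):  # property access, not a call
        print(f"{name} is trainable")
```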
@@ -1356,3 +1356,16 @@ def check_bool_env_var(env_var: str) -> bool:
     if value == "0":
         return False
     return bool(value)
+
+
+def _pipe(docs, proc, kwargs):
+    if hasattr(proc, "pipe"):
+        yield from proc.pipe(docs, **kwargs)
+    # We added some args for pipe that __call__ doesn't expect.
+    kwargs = dict(kwargs)
+    for arg in ["batch_size"]:
+        if arg in kwargs:
+            kwargs.pop(arg)
+    for doc in docs:
+        doc = proc(doc, **kwargs)
+        yield doc
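A short usage sketch (mine) of this underscore-private helper: it prefers a component's batched `pipe()` and otherwise calls the component one doc at a time, after stripping the `batch_size` argument that `__call__` would not accept:

```python
# Sketch: how util._pipe dispatches over a stream of docs (illustrative).
import spacy
from spacy import util

nlp = spacy.blank("en")
sentencizer = nlp.add_pipe("sentencizer")
docs = (nlp.make_doc(text) for text in ["One doc.", "Another doc."])
for doc in util._pipe(docs, sentencizer, {"batch_size": 2}):
    print(doc.text, doc[0].is_sent_start)
```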
@@ -1,5 +1,5 @@
 ---
-title: Pipe
+title: TrainablePipe
 tag: class
 teaser: Base class for trainable pipeline components
 ---
@ -10,30 +10,32 @@ components like the [`EntityRecognizer`](/api/entityrecognizer) or
|
||||||
interface that components should follow to function as trainable components in a
|
interface that components should follow to function as trainable components in a
|
||||||
spaCy pipeline. See the docs on
|
spaCy pipeline. See the docs on
|
||||||
[writing trainable components](/usage/processing-pipelines#trainable-components)
|
[writing trainable components](/usage/processing-pipelines#trainable-components)
|
||||||
for how to use the `Pipe` base class to implement custom components.
|
for how to use the `TrainablePipe` base class to implement custom components.
|
||||||
|
|
||||||
> #### Why is Pipe implemented in Cython?
|
<!-- TODO: Pipe vs TrainablePipe, check methods below (all renamed to TrainablePipe for now) -->
|
||||||
|
|
||||||
|
> #### Why is TrainablePipe implemented in Cython?
|
||||||
>
|
>
|
||||||
> The `Pipe` class is implemented in a `.pyx` module, the extension used by
|
> The `TrainablePipe` class is implemented in a `.pyx` module, the extension
|
||||||
> [Cython](/api/cython). This is needed so that **other** Cython classes, like
|
> used by [Cython](/api/cython). This is needed so that **other** Cython
|
||||||
> the [`EntityRecognizer`](/api/entityrecognizer) can inherit from it. But it
|
> classes, like the [`EntityRecognizer`](/api/entityrecognizer) can inherit from
|
||||||
> doesn't mean you have to implement trainable components in Cython – pure
|
> it. But it doesn't mean you have to implement trainable components in Cython –
|
||||||
> Python components like the [`TextCategorizer`](/api/textcategorizer) can also
|
> pure Python components like the [`TextCategorizer`](/api/textcategorizer) can
|
||||||
> inherit from `Pipe`.
|
> also inherit from `TrainablePipe`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/pipe.pyx
|
%%GITHUB_SPACY/spacy/pipeline/trainable_pipe.pyx
|
||||||
```
|
```
|
||||||
|
|
||||||
## Pipe.\_\_init\_\_ {#init tag="method"}
|
## TrainablePipe.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.pipeline import Pipe
|
> from spacy.pipeline import TrainablePipe
|
||||||
> from spacy.language import Language
|
> from spacy.language import Language
|
||||||
>
|
>
|
||||||
> class CustomPipe(Pipe):
|
> class CustomPipe(TrainablePipe):
|
||||||
> ...
|
> ...
|
||||||
>
|
>
|
||||||
> @Language.factory("your_custom_pipe", default_config={"model": MODEL})
|
> @Language.factory("your_custom_pipe", default_config={"model": MODEL})
|
||||||
@@ -45,14 +47,14 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
 | Name    | Description |
-| ------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| ------- | -------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | The shared vocabulary. ~~Vocab~~ |
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], Any]~~ |
 | `name`  | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| `**cfg` | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. |
+| `**cfg` | Additional config parameters and settings. Will be available as the dictionary `cfg` and is serialized with the component. |
 
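For orientation, a minimal sketch of what a subclass built on the renamed base class looks like when constructed directly — `CustomPipe`, the `Linear` dummy model and the `foo` setting are illustrative placeholders, not code from this commit:

```python
import spacy
from spacy.pipeline import TrainablePipe
from thinc.api import Linear

class CustomPipe(TrainablePipe):
    def __init__(self, vocab, model, name="custom_pipe", **cfg):
        # The base class expects the shared vocab, a Thinc model and a
        # string name; everything else lands in self.cfg and is
        # serialized together with the component.
        self.vocab = vocab
        self.model = model
        self.name = name
        self.cfg = dict(cfg)

nlp = spacy.blank("en")
pipe = CustomPipe(nlp.vocab, Linear(), foo=True)
assert pipe.cfg == {"foo": True}
```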
-## Pipe.\_\_call\_\_ {#call tag="method"}
+## TrainablePipe.\_\_call\_\_ {#call tag="method"}
 
 Apply the pipe to one document. The document is modified in place, and returned.
 This usually happens under the hood when the `nlp` object is called on a text
@@ -75,7 +77,7 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~ |
 
-## Pipe.pipe {#pipe tag="method"}
+## TrainablePipe.pipe {#pipe tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
 when the `nlp` object is called on a text and all pipeline components are
@@ -98,7 +100,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |
 
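As a usage sketch (not part of the diff): the calling conventions are unchanged by the rename, only the base class is new. This assumes a current 3.0 nightly where `Example` lives in `spacy.training`:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")  # a built-in TrainablePipe subclass

# One example so initialize can set up the tagger's label scheme
example = Example.from_dict(nlp.make_doc("I like trees"),
                            {"tags": ["PRON", "VERB", "NOUN"]})
nlp.initialize(lambda: [example])

# __call__: one Doc in, the same Doc (modified in place) out
doc = tagger(nlp.make_doc("A sentence to process."))

# pipe: a stream of Docs, buffered internally in batches
docs = [nlp.make_doc(t) for t in ("First text.", "Second text.")]
for doc in tagger.pipe(docs, batch_size=128):
    pass
```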
-## Pipe.initialize {#initialize tag="method" new="3"}
+## TrainablePipe.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -128,7 +130,7 @@ This method was previously called `begin_training`.
 | _keyword-only_ | |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
 
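A short sketch of the callback contract (illustrative; `get_examples` must be a callable that returns an iterable of `Example` objects):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")

def get_examples():
    doc = nlp.make_doc("I like trees")
    return [Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})]

# Only sets up labels and infers shapes — no weight updates happen here
tagger.initialize(get_examples, nlp=nlp)
```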
-## Pipe.predict {#predict tag="method"}
+## TrainablePipe.predict {#predict tag="method"}
 
 Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
 modifying them.
@@ -151,7 +153,7 @@ This method needs to be overwritten with your own custom `predict` method.
 | `docs`      | The documents to predict. ~~Iterable[Doc]~~ |
 | **RETURNS** | The model's prediction for each document. |
 
-## Pipe.set_annotations {#set_annotations tag="method"}
+## TrainablePipe.set_annotations {#set_annotations tag="method"}
 
 Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
 
@@ -175,7 +177,7 @@ method.
 | `docs`   | The documents to modify. ~~Iterable[Doc]~~ |
 | `scores` | The scores to set, produced by `Tagger.predict`. |
 
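How the two halves fit together in a subclass, as a schematic (the `MyPipe` class and the custom extension attribute are hypothetical):

```python
from spacy.pipeline import TrainablePipe
from spacy.tokens import Doc

Doc.set_extension("my_scores", default=None)

class MyPipe(TrainablePipe):
    def predict(self, docs):
        # Run the Thinc model only — the docs must not be touched here
        return self.model.predict(docs)

    def set_annotations(self, docs, scores):
        # Write the precomputed scores back onto the docs
        for doc, doc_scores in zip(docs, scores):
            doc._.my_scores = doc_scores
```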
-## Pipe.update {#update tag="method"}
+## TrainablePipe.update {#update tag="method"}
 
 Learn from a batch of [`Example`](/api/example) objects containing the
 predictions and gold-standard annotations, and update the component's model.
@@ -198,7 +200,7 @@ predictions and gold-standard annotations, and update the component's model.
 | `losses`    | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
 
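A training-loop sketch (illustrative; builds on the tagger example above):

```python
import random
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
examples = [
    Example.from_dict(nlp.make_doc("I like trees"),
                      {"tags": ["PRON", "VERB", "NOUN"]})
]
optimizer = nlp.initialize(lambda: examples)

losses = {}
for _ in range(10):
    random.shuffle(examples)
    # predict, compare to the gold annotations, backprop, step the optimizer
    tagger.update(examples, sgd=optimizer, losses=losses)
print(losses["tagger"])
```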
-## Pipe.rehearse {#rehearse tag="method,experimental" new="3"}
+## TrainablePipe.rehearse {#rehearse tag="method,experimental" new="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
 current model to make predictions similar to an initial model, to try to address
@@ -216,12 +218,11 @@ the "catastrophic forgetting" problem. This feature is experimental.
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
-| `drop`         | The dropout rate. ~~float~~ |
 | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
 
-## Pipe.get_loss {#get_loss tag="method"}
+## TrainablePipe.get_loss {#get_loss tag="method"}
 
 Find the loss and gradient of loss for the batch of documents and their
 predicted scores.
@@ -246,7 +247,7 @@ This method needs to be overwritten with your own custom `get_loss` method.
 | `scores`    | Scores representing the model's predictions. |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
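A sketch of a `get_loss` implementation, loosely modeled on the built-in tagger — the use of `SequenceCategoricalCrossentropy` and aligned `TAG` values is an assumption about the surrounding component, not code from this commit:

```python
from thinc.api import SequenceCategoricalCrossentropy

def get_loss(self, examples, scores):
    loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
    truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
    # Returns the gradient to backpropagate plus the scalar loss
    d_scores, loss = loss_func(scores, truths)
    return float(loss), d_scores
```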
-## Pipe.score {#score tag="method" new="3"}
+## TrainablePipe.score {#score tag="method" new="3"}
 
 Score a batch of examples.
 
@@ -261,7 +262,7 @@ Score a batch of examples.
 | `examples`  | The examples to score. ~~Iterable[Example]~~ |
 | **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 
-## Pipe.create_optimizer {#create_optimizer tag="method"}
+## TrainablePipe.create_optimizer {#create_optimizer tag="method"}
 
 Create an optimizer for the pipeline component. Defaults to
 [`Adam`](https://thinc.ai/docs/api-optimizers#adam) with default settings.
@@ -277,7 +278,7 @@ Create an optimizer for the pipeline component. Defaults to
 | ----------- | ---------------------------- |
 | **RETURNS** | The optimizer. ~~Optimizer~~ |
 
-## Pipe.use_params {#use_params tag="method, contextmanager"}
+## TrainablePipe.use_params {#use_params tag="method, contextmanager"}
 
 Modify the pipe's model, to use the given parameter values. At the end of the
 context, the original parameters are restored.
@@ -294,7 +295,7 @@ context, the original parameters are restored.
 | -------- | -------------------------------------------------- |
 | `params` | The parameter values to use in the model. ~~dict~~ |
 
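Typical use, as a sketch: serialize with the optimizer's averaged weights, then fall back to the live ones. This assumes `tagger` and `optimizer` from the training-loop example above, and an optimizer with parameter averaging enabled:

```python
with tagger.use_params(optimizer.averages):
    # Inside the block the model runs with the averaged weights ...
    tagger.to_disk("/tmp/tagger_averaged")
# ... and here the original parameters are restored
```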
-## Pipe.finish_update {#finish_update tag="method"}
+## TrainablePipe.finish_update {#finish_update tag="method"}
 
 Update parameters using the current parameter gradients. Defaults to calling
 [`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update).
@@ -312,7 +313,7 @@ Update parameters using the current parameter gradients. Defaults to calling
 | ----- | ------------------------------------- |
 | `sgd` | An optimizer. ~~Optional[Optimizer]~~ |
 
-## Pipe.add_label {#add_label tag="method"}
+## TrainablePipe.add_label {#add_label tag="method"}
 
 > #### Example
 >
@@ -347,12 +348,12 @@ case, all labels found in the sample will be automatically added to the model,
 and the output dimension will be
 [inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
 
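Label handling in brief, as an illustrative sketch: labels can be registered explicitly up front, or picked up from the sample data during `initialize`:

```python
import spacy

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
# add_label returns 1 if the label was new, 0 if it already existed
assert tagger.add_label("NOUN") == 1
assert tagger.add_label("NOUN") == 0
```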
-## Pipe.is_resizable {#is_resizable tag="method"}
+## TrainablePipe.is_resizable {#is_resizable tag="property"}
 
 > #### Example
 >
 > ```python
-> can_resize = pipe.is_resizable()
+> can_resize = pipe.is_resizable
 > ```
 >
 > With custom resizing implemented by a component:
@@ -378,7 +379,7 @@ as an attribute to the component's model.
 | ----------- | ---------------------------------------------------------------------------------------------- |
 | **RETURNS** | Whether or not the output dimension of the model can be changed after initialization. ~~bool~~ |
 
-## Pipe.set_output {#set_output tag="method"}
+## TrainablePipe.set_output {#set_output tag="method"}
 
 Change the output dimension of the component's model. If the component is not
 [resizable](#is_resizable), this method will raise a `NotImplementedError`. If a
@@ -390,7 +391,7 @@ care should be taken to avoid the "catastrophic forgetting" problem.
 > #### Example
 >
 > ```python
-> if pipe.is_resizable():
+> if pipe.is_resizable:
 >     pipe.set_output(512)
 > ```
 
@@ -398,7 +399,7 @@ care should be taken to avoid the "catastrophic forgetting" problem.
 | ---- | --------------------------------- |
 | `nO` | The new output dimension. ~~int~~ |
 
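The resizing convention sketched in code — an assumption based on the surrounding docs, namely that resizability is advertised via a `resize_output` attribute on the component's model; the no-op model and resize function are placeholders:

```python
from thinc.api import Model

def resize_output(model, new_nO):
    # Placeholder: rebuild or pad the output layer to the new width here
    return model

def noop_forward(model, X, is_train):
    return X, lambda dY: dY

model = Model("custom-model", noop_forward)
model.attrs["resize_output"] = resize_output  # makes is_resizable report True
```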
-## Pipe.to_disk {#to_disk tag="method"}
+## TrainablePipe.to_disk {#to_disk tag="method"}
 
 Serialize the pipe to disk.
 
@@ -415,7 +416,7 @@ Serialize the pipe to disk.
 | _keyword-only_ | |
 | `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
 
-## Pipe.from_disk {#from_disk tag="method"}
+## TrainablePipe.from_disk {#from_disk tag="method"}
 
 Load the pipe from disk. Modifies the object in place and returns it.
 
@@ -431,9 +432,9 @@ Load the pipe from disk. Modifies the object in place and returns it.
 | `path`         | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
 | _keyword-only_ | |
 | `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
-| **RETURNS**    | The modified pipe. ~~Pipe~~ |
+| **RETURNS**    | The modified pipe. ~~TrainablePipe~~ |
 
-## Pipe.to_bytes {#to_bytes tag="method"}
+## TrainablePipe.to_bytes {#to_bytes tag="method"}
 
 > #### Example
 >
@@ -450,7 +451,7 @@ Serialize the pipe to a bytestring.
 | `exclude`   | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
 | **RETURNS** | The serialized form of the pipe. ~~bytes~~ |
 
-## Pipe.from_bytes {#from_bytes tag="method"}
+## TrainablePipe.from_bytes {#from_bytes tag="method"}
 
 Load the pipe from a bytestring. Modifies the object in place and returns it.
 
@@ -467,16 +468,16 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
 | `bytes_data`   | The data to load from. ~~bytes~~ |
 | _keyword-only_ | |
 | `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
-| **RETURNS**    | The pipe. ~~Pipe~~ |
+| **RETURNS**    | The pipe. ~~TrainablePipe~~ |
 
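A round-trip sketch for the bytes serializers (assumes an initialized pipeline like the one from the update example above):

```python
pipe_bytes = nlp.get_pipe("tagger").to_bytes()

nlp2 = spacy.blank("en")
tagger2 = nlp2.add_pipe("tagger")
tagger2.from_bytes(pipe_bytes)
```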
 ## Attributes {#attributes}
 
 | Name    | Description |
-| ------- | ------------------------------------------------------------------------------------------------------------------------ |
+| ------- | --------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | The shared vocabulary that's passed in on initialization. ~~Vocab~~ |
 | `model` | The model powering the component. ~~Model[List[Doc], Any]~~ |
 | `name`  | The name of the component instance in the pipeline. Can be used in the losses. ~~str~~ |
-| `cfg`   | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ |
+| `cfg`   | Keyword arguments passed to [`TrainablePipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ |
 
 ## Serialization fields {#serialization-fields}
 
@@ -487,11 +488,10 @@ serialization by passing in the string names via the `exclude` argument.
 > #### Example
 >
 > ```python
-> data = pipe.to_disk("/path", exclude=["vocab"])
+> data = pipe.to_disk("/path")
 > ```
 
 | Name    | Description |
 | ------- | -------------------------------------------------------------- |
-| `vocab` | The shared [`Vocab`](/api/vocab). |
 | `cfg`   | The config file. You usually don't want to exclude this. |
 | `model` | The binary model data. You usually don't want to exclude this. |
 
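With `vocab` gone from the serialization fields, `exclude` is now mainly useful for `cfg` and `model` — for instance, to keep a freshly initialized model while restoring everything else (a sketch, reusing the `/path` placeholder from the example above):

```python
pipe.from_disk("/path", exclude=["model"])
```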
@@ -57,7 +57,8 @@ components for different language processing tasks and also allows adding
 | [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
 | [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. |
 | [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
-| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. |
+| [`Pipe`](/api/pipe) | Base class that pipeline components may inherit from. |
+| [`TrainablePipe`](/api/pipe) | Class that all trainable pipeline components inherit from. |
 
 ### Matchers {#architecture-matchers}
 
@@ -491,13 +491,14 @@ In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
 [trainable](/usage/processing-pipelines#trainable-components) pipeline component
 from scratch. This can be done by creating a new class inheriting from
-[`Pipe`](/api/pipe), and linking it up to your custom model implementation.
+[`TrainablePipe`](/api/pipe), and linking it up to your custom model
+implementation.
 
 <Infobox title="Trainable component API" emoji="💡">
 
 For details on how to implement pipeline components, check out the usage guide
 on [custom components](/usage/processing-pipelines#custom-component) and the
-overview of the `Pipe` methods used by
+overview of the `TrainablePipe` methods used by
 [trainable components](/usage/processing-pipelines#trainable-components).
 
 </Infobox>
@@ -646,15 +647,15 @@ get_candidates = model.attrs["get_candidates"]
 
 To use our new relation extraction model as part of a custom
 [trainable component](/usage/processing-pipelines#trainable-components), we
-create a subclass of [`Pipe`](/api/pipe) that holds the model.
+create a subclass of [`TrainablePipe`](/api/pipe) that holds the model.
 
 ![Illustration of Pipe methods](../images/trainable_component.svg)
 
 ```python
 ### Pipeline component skeleton
-from spacy.pipeline import Pipe
+from spacy.pipeline import TrainablePipe
 
-class RelationExtractor(Pipe):
+class RelationExtractor(TrainablePipe):
     def __init__(self, vocab, model, name="rel"):
         """Create a component instance."""
         self.model = model
@@ -757,9 +758,10 @@ def update(
 
 When the internal model is trained, the component can be used to make novel
 **predictions**. The [`predict`](/api/pipe#predict) function needs to be
-implemented for each subclass of `Pipe`. In our case, we can simply delegate to
-the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
-that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:
+implemented for each subclass of `TrainablePipe`. In our case, we can simply
+delegate to the internal model's
+[predict](https://thinc.ai/docs/api-model#predict) function that takes a batch
+of `Doc` objects and returns a ~~Floats2d~~ array:
 
 ```python
 ### The predict method
@@ -826,7 +828,7 @@ def __call__(self, Doc doc):
     return doc
 ```
 
-Once our `Pipe` subclass is fully implemented, we can
+Once our `TrainablePipe` subclass is fully implemented, we can
 [register](/usage/processing-pipelines#custom-components-factories) the
 component with the [`@Language.factory`](/api/language#factory) decorator. This
 assigns it a name and lets you create the component with
@@ -1169,10 +1169,10 @@ doc = nlp("This is a text...")
 
 ## Trainable components {#trainable-components new="3"}
 
-spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
-components that have their own model instance, make predictions over `Doc`
-objects and can be updated using [`spacy train`](/api/cli#train). This lets you
-plug fully custom machine learning components into your pipeline.
+spaCy's [`TrainablePipe`](/api/pipe) class helps you implement your own
+trainable components that have their own model instance, make predictions over
+`Doc` objects and can be updated using [`spacy train`](/api/cli#train). This
+lets you plug fully custom machine learning components into your pipeline.
 
 ![Illustration of Pipe methods](../images/trainable_component.svg)
 
@@ -1183,9 +1183,9 @@ You'll need the following:
    a [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a
    list of [`Doc`](/api/doc) objects as input and can have any type of output.
-2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least
-   two methods: [`Pipe.predict`](/api/pipe#predict) and
-   [`Pipe.set_annotations`](/api/pipe#set_annotations).
+2. **TrainablePipe subclass:** A subclass of [`TrainablePipe`](/api/pipe) that
+   implements at least two methods: [`TrainablePipe.predict`](/api/pipe#predict)
+   and [`TrainablePipe.set_annotations`](/api/pipe#set_annotations).
 3. **Component factory:** A component factory registered with
    [`@Language.factory`](/api/language#factory) that takes the `nlp` object and
    component `name` and optional settings provided by the config and returns an
@@ -1194,10 +1194,10 @@ You'll need the following:
 > #### Example
 >
 > ```python
-> from spacy.pipeline import Pipe
+> from spacy.pipeline import TrainablePipe
 > from spacy.language import Language
 >
-> class TrainableComponent(Pipe):
+> class TrainableComponent(TrainablePipe):
 >     def predict(self, docs):
 >         ...
 >
@@ -1214,11 +1214,11 @@ You'll need the following:
 | [`predict`](/api/pipe#predict) | Apply the component's model to a batch of [`Doc`](/api/doc) objects (without modifying them) and return the scores. |
 | [`set_annotations`](/api/pipe#set_annotations) | Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores generated by `predict`. |
 
-By default, [`Pipe.__init__`](/api/pipe#init) takes the shared vocab, the
-[`Model`](https://thinc.ai/docs/api-model) and the name of the component
+By default, [`TrainablePipe.__init__`](/api/pipe#init) takes the shared vocab,
+the [`Model`](https://thinc.ai/docs/api-model) and the name of the component
 instance in the pipeline, which you can use as a key in the losses. All other
-keyword arguments will become available as [`Pipe.cfg`](/api/pipe#cfg) and will
-also be serialized with the component.
+keyword arguments will become available as [`TrainablePipe.cfg`](/api/pipe#cfg)
+and will also be serialized with the component.
 
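As a sketch of that contract (the `threshold` setting is a hypothetical example of an extra keyword argument):

```python
from spacy.pipeline import TrainablePipe

class TrainableComponent(TrainablePipe):
    def __init__(self, vocab, model, name="trainable_component", *, threshold=0.5):
        self.vocab = vocab
        self.model = model
        self.name = name  # used as the key in the losses dict
        self.cfg = {"threshold": threshold}  # serialized with the component
```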
 <Accordion title="Why components should be passed a Model instance, not create it" spaced>
 
@@ -178,7 +178,8 @@ freely combine implementations from different frameworks into a single model.
 - **Thinc: **
   [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks),
   [`Model` API](https://thinc.ai/docs/api-model)
-- **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe)
+- **API:** [Model architectures](/api/architectures),
+  [`TrainablePipe`](/api/pipe)
 
 </Infobox>
 
@@ -428,7 +429,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
 | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |
 | [`Language.disabled`](/api/language#attributes) | Names of disabled components that are not run as part of the pipeline. |
-| [`Pipe.score`](/api/pipe#score) | Method on pipeline components that returns a dictionary of evaluation scores. |
+| [`TrainablePipe.score`](/api/pipe#score) | Method on pipeline components that returns a dictionary of evaluation scores. |
 | [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
 | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
 | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
@@ -483,7 +484,7 @@ format for documenting argument and return types.
 [`Morphologizer`](/api/morphologizer),
 [`AttributeRuler`](/api/attributeruler),
 [`SentenceRecognizer`](/api/sentencerecognizer),
-[`DependencyMatcher`](/api/dependencymatcher), [`Pipe`](/api/pipe),
+[`DependencyMatcher`](/api/dependencymatcher), [`TrainablePipe`](/api/pipe),
 [`Corpus`](/api/corpus)
 
 </Infobox>
@@ -522,7 +523,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
   [`@Language.factory`](/api/language#factory) decorator.
 - The [`Language.update`](/api/language#update),
   [`Language.evaluate`](/api/language#evaluate) and
-  [`Pipe.update`](/api/pipe#update) methods now all take batches of
+  [`TrainablePipe.update`](/api/pipe#update) methods now all take batches of
   [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
   raw text and a dictionary of annotations.
 - The `begin_training` methods have been renamed to `initialize` and now take a
@@ -947,7 +948,7 @@ annotations = {"entities": [(0, 15, "PERSON"), (30, 38, "ORG")]}
 
 The [`Language.update`](/api/language#update),
 [`Language.evaluate`](/api/language#evaluate) and
-[`Pipe.update`](/api/pipe#update) methods now all take batches of
+[`TrainablePipe.update`](/api/pipe#update) methods now all take batches of
 [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
 raw text and a dictionary of annotations.
 
@@ -967,12 +968,13 @@ for i in range(20):
     nlp.update(examples)
 ```
 
-`Language.begin_training` and `Pipe.begin_training` have been renamed to
-[`Language.initialize`](/api/language#initialize) and
-[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
-that returns a sequence of `Example` objects to initialize the model instead of
-a list of tuples. The data examples are used to **initialize the models** of
-trainable pipeline components, which includes validating the network,
+`Language.begin_training` and `TrainablePipe.begin_training` have been renamed
+to [`Language.initialize`](/api/language#initialize) and
+[`TrainablePipe.initialize`](/api/pipe#initialize), and the methods now take a
+function that returns a sequence of `Example` objects to initialize the model
+instead of a list of tuples. The data examples are used to **initialize the
+models** of trainable pipeline components, which includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme.
 
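A migration sketch for the rename described above (`TRAIN_DATA` and `nlp` stand in for whatever the surrounding guide defines):

```python
from spacy.training import Example

def get_examples():
    for text, annots in TRAIN_DATA:
        yield Example.from_dict(nlp.make_doc(text), annots)

# v2: optimizer = nlp.begin_training()
optimizer = nlp.initialize(get_examples)
```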