Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 09:14:32 +03:00)

Commit abba639565: Merge remote-tracking branch 'upstream/develop' into feature/more-v3-docs
@@ -15,7 +8 @@ import spacy.util
from bin.ud import conll17_ud_eval
from spacy.tokens import Token, Doc
from spacy.gold import Example
-from spacy.util import compounding, minibatch, minibatch_by_words
+from spacy.util import compounding, minibatch
+from spacy.gold.batchers import minibatch_by_words
from spacy.pipeline._parser_internals.nonproj import projectivize
from spacy.matcher import Matcher
from spacy import displacy
@@ -48,8 +48,7 @@ def main(model, output_dir=None):
# You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
# For simplicity, we'll just use the original vector dimension here instead.
vectors_dim = nlp.vocab.vectors.shape[1]
-kb = KnowledgeBase(entity_vector_length=vectors_dim)
-kb.initialize(nlp.vocab)
+kb = KnowledgeBase(nlp.vocab, entity_vector_length=vectors_dim)

# set up the data
entity_ids = []
@@ -81,7 +80,7 @@ def main(model, output_dir=None):
if not output_dir.exists():
output_dir.mkdir()
kb_path = str(output_dir / "kb")
-kb.dump(kb_path)
+kb.to_disk(kb_path)
print()
print("Saved KB to", kb_path)

@@ -96,9 +95,8 @@ def main(model, output_dir=None):
print("Loading vocab from", vocab_path)
print("Loading KB from", kb_path)
vocab2 = Vocab().from_disk(vocab_path)
-kb2 = KnowledgeBase(entity_vector_length=1)
-kb.initialize(vocab2)
-kb2.load_bulk(kb_path)
+kb2 = KnowledgeBase(vocab2, entity_vector_length=1)
+kb2.from_disk(kb_path)
print()
_print_kb(kb2)

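For readers following the KB API change above, here is a minimal standalone sketch of the new construction and serialization flow; the entity IDs, alias, and the "kb" output path are invented for illustration:

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
# The vocab is now passed directly to the constructor; kb.initialize() is gone.
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1, 2, 3])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])

# dump()/load_bulk() are replaced by to_disk()/from_disk().
kb.to_disk("kb")
kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb2.from_disk("kb")
```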
@@ -83,7 +83,7 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
if "entity_linker" not in nlp.pipe_names:
print("Loading Knowledge Base from '%s'" % kb_path)
cfg = {
-"kb": {
+"kb_loader": {
"@assets": "spacy.KBFromFile.v1",
"vocab_path": vocab_path,
"kb_path": kb_path,
@@ -68,11 +68,12 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
opt = args.pop(0)
err = f"Invalid CLI argument '{opt}'"
if opt.startswith("--"): # new argument
-opt = opt.replace("--", "").replace("-", "_")
+opt = opt.replace("--", "")
if "." not in opt:
msg.fail(f"{err}: can't override top-level section", exits=1)
if "=" in opt: # we have --opt=value
opt, value = opt.split("=", 1)
+opt = opt.replace("-", "_")
else:
if not args or args[0].startswith("--"): # flag with no value
value = "true"
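A standalone sketch of the override normalization shown in the hunk above (not the actual CLI entry point, which lives in spacy/cli/_util.py and also handles bare flags and positional values):

```python
def normalize_override(opt: str, value: str = "true"):
    opt = opt.replace("--", "")  # strip the leading dashes, keep dots for now
    if "." not in opt:
        raise ValueError(f"Invalid CLI argument '--{opt}': can't override top-level section")
    if "=" in opt:  # --section.key=value form
        opt, value = opt.split("=", 1)
        opt = opt.replace("-", "_")  # dashes in the key name become underscores
    return opt, value

print(normalize_override("--training.batch-size=128"))  # ('training.batch_size', '128')
```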
@@ -229,6 +229,7 @@ if __name__ == '__main__':

TEMPLATE_MANIFEST = """
include meta.json
+include config.cfg
""".strip()


@@ -75,7 +75,7 @@ def train(
msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}")
with show_validation_error(config_path):
-config = util.load_config(config_path, overrides=config_overrides)
+config = util.load_config(config_path, overrides=config_overrides, interpolate=True)
if config.get("training", {}).get("seed") is not None:
fix_random_seed(config["training"]["seed"])
# Use original config here before it's resolved to functions
@@ -78,10 +78,11 @@ class Warnings:
"are currently: {langs}")

# TODO: fix numbering after merging develop into master
+W090 = ("Could not locate any binary .spacy files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
W093 = ("Could not find any data to train the {name} on. Is your "
-"input data correctly formatted ?")
+"input data correctly formatted?")
W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
"spaCy version requirement: {version}. This can lead to compatibility "
"problems with older versions, or as new spaCy versions are "
@@ -476,6 +477,10 @@ class Errors:
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

# TODO: fix numbering after merging develop into master
+E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
+"provided argument {loc} is an existing directory.")
+E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
+"not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
@@ -503,8 +508,6 @@ class Errors:
"not found in pipeline. Available components: {opts}")
E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded "
"nlp object, but got: {source}")
-E946 = ("The Vocab for the knowledge base is not initialized. Did you forget to "
-"call kb.initialize()?")
E947 = ("Matcher.add received invalid 'greedy' argument: expected "
"a string value from {expected} but got: '{arg}'")
E948 = ("Matcher.add received invalid 'patterns' argument: expected "
@@ -600,7 +603,8 @@ class Errors:
"\"en_core_web_sm\" will copy the component from that model.\n\n{config}")
E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
E986 = ("Could not create any training batches: check your input. "
-"Perhaps discard_oversize should be set to False ?")
+"Are the train and dev paths defined? "
+"Is 'discard_oversize' set appropriately? ")
E987 = ("The text of an example training instance is either a Doc or "
"a string, but found {type} instead.")
E988 = ("Could not parse any training examples. Ensure the data is "
@@ -610,8 +614,6 @@ class Errors:
"of the training data in spaCy 3.0 onwards. The 'update' "
"function should now be called with a batch of 'Example' "
"objects, instead of (text, annotation) tuples. ")
-E990 = ("An entity linking component needs to be initialized with a "
-"KnowledgeBase object, but found {type} instead.")
E991 = ("The function 'select_pipes' should be called with either a "
"'disable' argument to list the names of the pipe components "
"that should be disabled, or with an 'enable' argument that "
@@ -1,8 +1,10 @@
+import warnings
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from pathlib import Path

from .. import util
from .example import Example
+from ..errors import Warnings
from ..tokens import DocBin, Doc
from ..vocab import Vocab

@@ -10,6 +12,8 @@ if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from ..language import Language # noqa: F401

+FILE_TYPE = ".spacy"
+

@util.registry.readers("spacy.Corpus.v1")
def create_docbin_reader(
@@ -53,8 +57,9 @@ class Corpus:
@staticmethod
def walk_corpus(path: Union[str, Path]) -> List[Path]:
path = util.ensure_path(path)
-if not path.is_dir():
+if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE):
return [path]
+orig_path = path
paths = [path]
locs = []
seen = set()
@@ -66,8 +71,10 @@ class Corpus:
continue
elif path.is_dir():
paths.extend(path.iterdir())
-elif path.parts[-1].endswith(".spacy"):
+elif path.parts[-1].endswith(FILE_TYPE):
locs.append(path)
+if len(locs) == 0:
+warnings.warn(Warnings.W090.format(path=orig_path))
return locs

def __call__(self, nlp: "Language") -> Iterator[Example]:
@@ -135,7 +142,7 @@ class Corpus:
i = 0
for loc in locs:
loc = util.ensure_path(loc)
-if loc.parts[-1].endswith(".spacy"):
+if loc.parts[-1].endswith(FILE_TYPE):
doc_bin = DocBin().from_disk(loc)
docs = doc_bin.get_docs(vocab)
for doc in docs:
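A hedged sketch of the reader behaviour introduced above: walk_corpus collects .spacy files under a directory and emits warning W090 when it finds none. The directory path is invented for illustration:

```python
from pathlib import Path
from spacy.gold.corpus import Corpus

# Returns the .spacy files found under the directory (recursing into subdirectories);
# warns with W090 and returns an empty list if nothing matches.
locs = Corpus.walk_corpus(Path("corpus/train"))
print([loc.name for loc in locs])
```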
@@ -140,7 +140,7 @@ cdef class KnowledgeBase:
self._entries.push_back(entry)
self._aliases_table.push_back(alias)

-cpdef load_bulk(self, loc)
+cpdef from_disk(self, loc)
cpdef set_entities(self, entity_list, freq_list, vector_list)


spacy/kb.pyx (54 changed lines)
@@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True
+from typing import Iterator
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
@@ -64,6 +65,16 @@ cdef class Candidate:
return self.prior_prob


+def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
+"""
+Return candidate entities for a given span by using the text of the span as the alias
+and fetching appropriate entries from the index.
+This particular function is optimized to work with the built-in KB functionality,
+but any other custom candidate generation method can be used in combination with the KB as well.
+"""
+return kb.get_alias_candidates(span.text)
+
+
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
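The module-level get_candidates function above is also the hook for custom candidate generation. A small sketch of a drop-in replacement with the same (kb, span) signature; the lower-casing behaviour and the asset name are only examples (the same pattern appears in the tests further down):

```python
from spacy import registry


def get_lowercased_candidates(kb, span):
    # Same signature as the built-in get_candidates(kb, span), but case-insensitive.
    return kb.get_alias_candidates(span.text.lower())


@registry.assets.register("my.LowercaseCandidateGenerator.v1")
def create_lowercase_candidates():
    return get_lowercased_candidates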
@@ -71,25 +82,16 @@ cdef class KnowledgeBase:
DOCS: https://spacy.io/api/kb
"""

-def __init__(self, entity_vector_length):
-"""Create a KnowledgeBase. Make sure to call kb.initialize() before using it."""
+def __init__(self, Vocab vocab, entity_vector_length):
+"""Create a KnowledgeBase."""
self.mem = Pool()
self.entity_vector_length = entity_vector_length

self._entry_index = PreshMap()
self._alias_index = PreshMap()
-self.vocab = None
-
-
-def initialize(self, Vocab vocab):
self.vocab = vocab
self.vocab.strings.add("")
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])

-def require_vocab(self):
-if self.vocab is None:
-raise ValueError(Errors.E946)
-
@property
def entity_vector_length(self):
"""RETURNS (uint64): length of the entity vectors"""
@@ -102,14 +104,12 @@ cdef class KnowledgeBase:
return len(self._entry_index)

def get_entity_strings(self):
-self.require_vocab()
return [self.vocab.strings[x] for x in self._entry_index]

def get_size_aliases(self):
return len(self._alias_index)

def get_alias_strings(self):
-self.require_vocab()
return [self.vocab.strings[x] for x in self._alias_index]

def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
@@ -117,7 +117,6 @@ cdef class KnowledgeBase:
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
Return the hash of the entity ID/name at the end.
"""
-self.require_vocab()
cdef hash_t entity_hash = self.vocab.strings.add(entity)

# Return if this entity was added before
@@ -140,7 +139,6 @@ cdef class KnowledgeBase:
return entity_hash

cpdef set_entities(self, entity_list, freq_list, vector_list):
-self.require_vocab()
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140)

@@ -176,12 +174,10 @@ cdef class KnowledgeBase:
i += 1

def contains_entity(self, unicode entity):
-self.require_vocab()
cdef hash_t entity_hash = self.vocab.strings.add(entity)
return entity_hash in self._entry_index

def contains_alias(self, unicode alias):
-self.require_vocab()
cdef hash_t alias_hash = self.vocab.strings.add(alias)
return alias_hash in self._alias_index

@@ -190,7 +186,6 @@ cdef class KnowledgeBase:
For a given alias, add its potential entities and prior probabilies to the KB.
Return the alias_hash at the end
"""
-self.require_vocab()
# Throw an error if the length of entities and probabilities are not the same
if not len(entities) == len(probabilities):
raise ValueError(Errors.E132.format(alias=alias,
@@ -234,7 +229,6 @@ cdef class KnowledgeBase:
Throw an error if this entity+prior prob would exceed the sum of 1.
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
"""
-self.require_vocab()
# Check if the alias exists in the KB
cdef hash_t alias_hash = self.vocab.strings[alias]
if not alias_hash in self._alias_index:
@@ -274,14 +268,12 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry


-def get_candidates(self, unicode alias):
+def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If the alias is not known in the KB, and empty list is returned.
"""
-self.require_vocab()
cdef hash_t alias_hash = self.vocab.strings[alias]
if not alias_hash in self._alias_index:
return []
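For orientation, a short usage sketch of the renamed lookup, continuing the hypothetical kb built in the earlier sketch (alias string invented):

```python
# get_alias_candidates() replaces the old KnowledgeBase.get_candidates() method.
for cand in kb.get_alias_candidates("Douglas"):
    print(cand.entity_, cand.alias_, cand.prior_prob)
```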
@@ -298,7 +290,6 @@ cdef class KnowledgeBase:
if entry_index != 0]

def get_vector(self, unicode entity):
-self.require_vocab()
cdef hash_t entity_hash = self.vocab.strings[entity]

# Return an empty list if this entity is unknown in this KB
@@ -311,7 +302,6 @@ cdef class KnowledgeBase:
def get_prior_prob(self, unicode entity, unicode alias):
""" Return the prior probability of a given alias being linked to a given entity,
or return 0.0 when this combination is not known in the knowledge base"""
-self.require_vocab()
cdef hash_t alias_hash = self.vocab.strings[alias]
cdef hash_t entity_hash = self.vocab.strings[entity]

@@ -329,8 +319,7 @@ cdef class KnowledgeBase:
return 0.0


-def dump(self, loc):
-self.require_vocab()
+def to_disk(self, loc):
cdef Writer writer = Writer(loc)
writer.write_header(self.get_size_entities(), self.entity_vector_length)

@@ -370,7 +359,7 @@ cdef class KnowledgeBase:

writer.close()

-cpdef load_bulk(self, loc):
+cpdef from_disk(self, loc):
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
@@ -462,12 +451,11 @@ cdef class KnowledgeBase:

cdef class Writer:
def __init__(self, object loc):
-if path.exists(loc):
-assert not path.isdir(loc), f"{loc} is directory"
if isinstance(loc, Path):
loc = bytes(loc)
if path.exists(loc):
-assert not path.isdir(loc), "%s is directory." % loc
+if path.isdir(loc):
+raise ValueError(Errors.E928.format(loc=loc))
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp:
@@ -511,8 +499,10 @@ cdef class Reader:
def __init__(self, object loc):
if isinstance(loc, Path):
loc = bytes(loc)
-assert path.exists(loc)
-assert not path.isdir(loc)
+if not path.exists(loc):
+raise ValueError(Errors.E929.format(loc=loc))
+if path.isdir(loc):
+raise ValueError(Errors.E928.format(loc=loc))
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp:
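A rough sketch of the new failure modes: paths that are existing directories or that do not exist now surface as ValueError (E928 / E929) instead of tripping asserts. The paths below are invented and only meant to trigger the errors:

```python
from pathlib import Path
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
try:
    kb.to_disk(Path("."))  # an existing directory should raise E928
except ValueError as err:
    print(err)
try:
    kb.from_disk(Path("does_not_exist/kb"))  # a missing path should raise E929
except ValueError as err:
    print(err)
```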
@@ -772,9 +772,9 @@ class Language:
self.remove_pipe(name)
if not len(self.pipeline) or pipe_index == len(self.pipeline):
# we have no components to insert before/after, or we're replacing the last component
-self.add_pipe(factory_name, name=name)
+self.add_pipe(factory_name, name=name, config=config, validate=validate)
else:
-self.add_pipe(factory_name, name=name, before=pipe_index)
+self.add_pipe(factory_name, name=name, before=pipe_index, config=config, validate=validate)

def rename_pipe(self, old_name: str, new_name: str) -> None:
"""Rename a pipeline component.
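A small sketch of the behaviour this enables: replace_pipe can now forward a config to the replacement component (this mirrors test_replace_pipe_config further down in this commit):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("entity_linker")
# Swap the component for a fresh instance with a different config.
nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False})
assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == False
```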
@@ -1,9 +1,9 @@
-from typing import Optional
+from typing import Optional, Callable, Iterable
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear

from ...util import registry
-from ...kb import KnowledgeBase
+from ...kb import KnowledgeBase, Candidate, get_candidates
from ...vocab import Vocab


@@ -25,15 +25,21 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:


@registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
-vocab = Vocab().from_disk(vocab_path)
-kb = KnowledgeBase(entity_vector_length=1)
-kb.initialize(vocab)
-kb.load_bulk(kb_path)
-return kb
+def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
+def kb_from_file(vocab):
+kb = KnowledgeBase(vocab, entity_vector_length=1)
+kb.from_disk(kb_path)
+return kb
+return kb_from_file


@registry.assets.register("spacy.EmptyKB.v1")
-def empty_kb(entity_vector_length: int) -> KnowledgeBase:
-kb = KnowledgeBase(entity_vector_length=entity_vector_length)
-return kb
+def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
+def empty_kb_factory(vocab):
+return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
+return empty_kb_factory


+@registry.assets.register("spacy.CandidateGenerator.v1")
+def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+return get_candidates
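Following the same pattern as spacy.KBFromFile.v1 above, a project-specific KB loader can be registered as an asset that returns a Vocab-to-KnowledgeBase callable. A hedged sketch; the asset name "my.KBFromFile.v1" and the vector length are invented:

```python
from typing import Callable

from spacy import registry
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab


@registry.assets.register("my.KBFromFile.v1")
def load_my_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
    def kb_from_file(vocab: Vocab) -> KnowledgeBase:
        kb = KnowledgeBase(vocab, entity_vector_length=1)
        kb.from_disk(kb_path)
        return kb

    return kb_from_file
```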
@@ -6,7 +6,7 @@ from thinc.api import CosineDistance, get_array_module, Model, Optimizer, Config
from thinc.api import set_dropout_rate
import warnings

-from ..kb import KnowledgeBase
+from ..kb import KnowledgeBase, Candidate
from ..tokens import Doc
from .pipe import Pipe, deserialize_config
from ..language import Language
@@ -32,35 +32,30 @@ subword_features = true
"""
DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]

-default_kb_config = """
-[kb]
-@assets = "spacy.EmptyKB.v1"
-entity_vector_length = 64
-"""
-DEFAULT_NEL_KB = Config().from_str(default_kb_config)["kb"]


@Language.factory(
"entity_linker",
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"],
default_config={
-"kb": DEFAULT_NEL_KB,
+"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64},
"model": DEFAULT_NEL_MODEL,
"labels_discard": [],
"incl_prior": True,
"incl_context": True,
+"get_candidates": {"@assets": "spacy.CandidateGenerator.v1"},
},
)
def make_entity_linker(
nlp: Language,
name: str,
model: Model,
-kb: KnowledgeBase,
+kb_loader: Callable[[Vocab], KnowledgeBase],
*,
labels_discard: Iterable[str],
incl_prior: bool,
incl_context: bool,
+get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
):
"""Construct an EntityLinker component.

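A hedged sketch of wiring this factory into a pipeline via config. The "my.KBFromFile.v1" loader is the hypothetical asset from the earlier sketch and the KB path is invented; unspecified keys fall back to the defaults shown above:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.add_pipe(
    "entity_linker",
    config={
        "kb_loader": {"@assets": "my.KBFromFile.v1", "kb_path": "kb"},
        "get_candidates": {"@assets": "spacy.CandidateGenerator.v1"},
        "incl_prior": True,
    },
)
```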
@@ -76,10 +71,11 @@ def make_entity_linker(
nlp.vocab,
model,
name,
-kb=kb,
+kb_loader=kb_loader,
labels_discard=labels_discard,
incl_prior=incl_prior,
incl_context=incl_context,
+get_candidates=get_candidates,
)


@@ -97,10 +93,11 @@ class EntityLinker(Pipe):
model: Model,
name: str = "entity_linker",
*,
-kb: KnowledgeBase,
+kb_loader: Callable[[Vocab], KnowledgeBase],
labels_discard: Iterable[str],
incl_prior: bool,
incl_context: bool,
+get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
) -> None:
"""Initialize an entity linker.

@@ -108,7 +105,7 @@ class EntityLinker(Pipe):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
-kb (KnowledgeBase): The KnowledgeBase holding all entities and their aliases.
+kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
@@ -119,17 +116,12 @@ class EntityLinker(Pipe):
self.model = model
self.name = name
cfg = {
-"kb": kb,
"labels_discard": list(labels_discard),
"incl_prior": incl_prior,
"incl_context": incl_context,
}
-if not isinstance(kb, KnowledgeBase):
-raise ValueError(Errors.E990.format(type=type(self.kb)))
-kb.initialize(vocab)
-self.kb = kb
-if "kb" in cfg:
-del cfg["kb"] # we don't want to duplicate its serialization
+self.kb = kb_loader(self.vocab)
+self.get_candidates = get_candidates
self.cfg = dict(cfg)
self.distance = CosineDistance(normalize=False)
# how many neightbour sentences to take into account
@@ -326,10 +318,11 @@ class EntityLinker(Pipe):
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
-sentence_encoding = self.model.predict([sent_doc])[0]
-xp = get_array_module(sentence_encoding)
-sentence_encoding_t = sentence_encoding.T
-sentence_norm = xp.linalg.norm(sentence_encoding_t)
+xp = self.model.ops.xp
+if self.cfg.get("incl_context"):
+sentence_encoding = self.model.predict([sent_doc])[0]
+sentence_encoding_t = sentence_encoding.T
+sentence_norm = xp.linalg.norm(sentence_encoding_t)
for ent in sent.ents:
entity_count += 1
to_discard = self.cfg.get("labels_discard", [])
@@ -337,7 +330,7 @@ class EntityLinker(Pipe):
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
-candidates = self.kb.get_candidates(ent.text)
+candidates = self.get_candidates(self.kb, ent)
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
@@ -421,10 +414,9 @@ class EntityLinker(Pipe):
DOCS: https://spacy.io/api/entitylinker#to_disk
"""
serialize = {}
-self.cfg["entity_width"] = self.kb.entity_vector_length
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
-serialize["kb"] = lambda p: self.kb.dump(p)
+serialize["kb"] = lambda p: self.kb.to_disk(p)
serialize["model"] = lambda p: self.model.to_disk(p)
util.to_disk(path, serialize, exclude)

@@ -446,15 +438,10 @@ class EntityLinker(Pipe):
except AttributeError:
raise ValueError(Errors.E149) from None

-def load_kb(p):
-self.kb = KnowledgeBase(entity_vector_length=self.cfg["entity_width"])
-self.kb.initialize(self.vocab)
-self.kb.load_bulk(p)
-
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
-deserialize["kb"] = load_kb
+deserialize["kb"] = lambda p: self.kb.from_disk(p)
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
return self
@@ -68,7 +68,6 @@ class Tagger(Pipe):
name (str): The component instance name, used to add entries to the
losses during training.
-labels (List): The set of labels. Defaults to None.
set_morphology (bool): Whether to set morphological features.

DOCS: https://spacy.io/api/tagger#init
"""
@@ -167,18 +167,20 @@ class ModelMetaSchema(BaseModel):
lang: StrictStr = Field(..., title="Two-letter language code, e.g. 'en'")
name: StrictStr = Field(..., title="Model name")
version: StrictStr = Field(..., title="Model version")
-spacy_version: Optional[StrictStr] = Field(None, title="Compatible spaCy version identifier")
-parent_package: Optional[StrictStr] = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
-pipeline: Optional[List[StrictStr]] = Field([], title="Names of pipeline components")
-description: Optional[StrictStr] = Field(None, title="Model description")
-license: Optional[StrictStr] = Field(None, title="Model license")
-author: Optional[StrictStr] = Field(None, title="Model author name")
-email: Optional[StrictStr] = Field(None, title="Model author email")
-url: Optional[StrictStr] = Field(None, title="Model author URL")
-sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
-vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors")
-accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
-speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
+spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier")
+parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
+pipeline: List[StrictStr] = Field([], title="Names of pipeline components")
+description: StrictStr = Field("", title="Model description")
+license: StrictStr = Field("", title="Model license")
+author: StrictStr = Field("", title="Model author name")
+email: StrictStr = Field("", title="Model author email")
+url: StrictStr = Field("", title="Model author URL")
+sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
+vectors: Dict[str, Any] = Field({}, title="Included word vectors")
+labels: Dict[str, Dict[str, List[str]]] = Field({}, title="Component labels, keyed by component name")
+accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers")
+speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers")
+spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
# fmt: on


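Since the meta fields now carry non-None defaults, only lang, name and version remain required. A small sketch of validating a minimal meta.json-style payload; all field values below are invented for illustration:

```python
from spacy.schemas import ModelMetaSchema

meta = ModelMetaSchema(
    lang="en",
    name="example_model",
    version="0.0.1",
    description="Toy pipeline metadata",
    pipeline=["tok2vec", "ner"],
)
print(meta.parent_package)  # falls back to the default "spacy"
```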
@@ -1,6 +1,7 @@
+from typing import Callable, Iterable
import pytest

-from spacy.kb import KnowledgeBase
+from spacy.kb import KnowledgeBase, get_candidates, Candidate

from spacy import util, registry
from spacy.gold import Example
@@ -21,8 +22,7 @@ def assert_almost_equal(a, b):

def test_kb_valid_entities(nlp):
"""Test the valid construction of a KB with 3 entities and two aliases"""
-mykb = KnowledgeBase(entity_vector_length=3)
-mykb.initialize(nlp.vocab)
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])
@@ -51,8 +51,7 @@ def test_kb_valid_entities(nlp):

def test_kb_invalid_entities(nlp):
"""Test the invalid construction of a KB with an alias linked to a non-existing entity"""
-mykb = KnowledgeBase(entity_vector_length=1)
-mykb.initialize(nlp.vocab)
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -68,8 +67,7 @@ def test_kb_invalid_entities(nlp):

def test_kb_invalid_probabilities(nlp):
"""Test the invalid construction of a KB with wrong prior probabilities"""
-mykb = KnowledgeBase(entity_vector_length=1)
-mykb.initialize(nlp.vocab)
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -83,8 +81,7 @@ def test_kb_invalid_probabilities(nlp):

def test_kb_invalid_combination(nlp):
"""Test the invalid construction of a KB with non-matching entity and probability lists"""
-mykb = KnowledgeBase(entity_vector_length=1)
-mykb.initialize(nlp.vocab)
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -100,8 +97,7 @@ def test_kb_invalid_combination(nlp):

def test_kb_invalid_entity_vector(nlp):
"""Test the invalid construction of a KB with non-matching entity vector lengths"""
-mykb = KnowledgeBase(entity_vector_length=3)
-mykb.initialize(nlp.vocab)
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])
@@ -117,14 +113,14 @@ def test_kb_default(nlp):
assert len(entity_linker.kb) == 0
assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0
-# default value from pipeline.entity_linker
+# 64 is the default value from pipeline.entity_linker
assert entity_linker.kb.entity_vector_length == 64


def test_kb_custom_length(nlp):
"""Test that the default (empty) KB can be configured with a custom entity length"""
entity_linker = nlp.add_pipe(
-"entity_linker", config={"kb": {"entity_vector_length": 35}}
+"entity_linker", config={"kb_loader": {"entity_vector_length": 35}}
)
assert len(entity_linker.kb) == 0
assert entity_linker.kb.get_size_entities() == 0
@@ -141,7 +137,7 @@ def test_kb_undefined(nlp):

def test_kb_empty(nlp):
"""Test that the EL can't train with an empty KB"""
-config = {"kb": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
+config = {"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
entity_linker = nlp.add_pipe("entity_linker", config=config)
assert len(entity_linker.kb) == 0
with pytest.raises(ValueError):
@@ -150,8 +146,13 @@ def test_kb_empty(nlp):

def test_candidate_generation(nlp):
"""Test correct candidate generation"""
-mykb = KnowledgeBase(entity_vector_length=1)
-mykb.initialize(nlp.vocab)
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+doc = nlp("douglas adam Adam shrubbery")

+douglas_ent = doc[0:1]
+adam_ent = doc[1:2]
+Adam_ent = doc[2:3]
+shrubbery_ent = doc[3:4]

# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -163,21 +164,76 @@ def test_candidate_generation(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

# test the size of the relevant candidates
-assert len(mykb.get_candidates("douglas")) == 2
-assert len(mykb.get_candidates("adam")) == 1
-assert len(mykb.get_candidates("shrubbery")) == 0
+assert len(get_candidates(mykb, douglas_ent)) == 2
+assert len(get_candidates(mykb, adam_ent)) == 1
+assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive
+assert len(get_candidates(mykb, shrubbery_ent)) == 0

# test the content of the candidates
-assert mykb.get_candidates("adam")[0].entity_ == "Q2"
-assert mykb.get_candidates("adam")[0].alias_ == "adam"
-assert_almost_equal(mykb.get_candidates("adam")[0].entity_freq, 12)
-assert_almost_equal(mykb.get_candidates("adam")[0].prior_prob, 0.9)
+assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
+assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
+assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
+assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)


+def test_el_pipe_configuration(nlp):
+"""Test correct candidate generation as part of the EL pipe"""
+nlp.add_pipe("sentencizer")
+pattern = {"label": "PERSON", "pattern": [{"LOWER": "douglas"}]}
+ruler = nlp.add_pipe("entity_ruler")
+ruler.add_patterns([pattern])
+
+@registry.assets.register("myAdamKB.v1")
+def mykb() -> Callable[["Vocab"], KnowledgeBase]:
+def create_kb(vocab):
+kb = KnowledgeBase(vocab, entity_vector_length=1)
+kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+kb.add_alias(
+alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]
+)
+return kb
+
+return create_kb
+
+# run an EL pipe without a trained context encoder, to check the candidate generation step only
+nlp.add_pipe(
+"entity_linker",
+config={"kb_loader": {"@assets": "myAdamKB.v1"}, "incl_context": False},
+)
+# With the default get_candidates function, matching is case-sensitive
+text = "Douglas and douglas are not the same."
+doc = nlp(text)
+assert doc[0].ent_kb_id_ == "NIL"
+assert doc[1].ent_kb_id_ == ""
+assert doc[2].ent_kb_id_ == "Q2"
+
+def get_lowercased_candidates(kb, span):
+return kb.get_alias_candidates(span.text.lower())
+
+@registry.assets.register("spacy.LowercaseCandidateGenerator.v1")
+def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+return get_lowercased_candidates
+
+# replace the pipe with a new one with with a different candidate generator
+nlp.replace_pipe(
+"entity_linker",
+"entity_linker",
+config={
+"kb_loader": {"@assets": "myAdamKB.v1"},
+"incl_context": False,
+"get_candidates": {"@assets": "spacy.LowercaseCandidateGenerator.v1"},
+},
+)
+doc = nlp(text)
+assert doc[0].ent_kb_id_ == "Q2"
+assert doc[1].ent_kb_id_ == ""
+assert doc[2].ent_kb_id_ == "Q2"


def test_append_alias(nlp):
"""Test that we can append additional alias-entity pairs"""
-mykb = KnowledgeBase(entity_vector_length=1)
-mykb.initialize(nlp.vocab)
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -189,26 +245,25 @@ def test_append_alias(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

# test the size of the relevant candidates
-assert len(mykb.get_candidates("douglas")) == 2
+assert len(mykb.get_alias_candidates("douglas")) == 2

# append an alias
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)

# test the size of the relevant candidates has been incremented
-assert len(mykb.get_candidates("douglas")) == 3
+assert len(mykb.get_alias_candidates("douglas")) == 3

# append the same alias-entity pair again should not work (will throw a warning)
with pytest.warns(UserWarning):
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)

# test the size of the relevant candidates remained unchanged
-assert len(mykb.get_candidates("douglas")) == 3
+assert len(mykb.get_alias_candidates("douglas")) == 3


def test_append_invalid_alias(nlp):
"""Test that append an alias will throw an error if prior probs are exceeding 1"""
-mykb = KnowledgeBase(entity_vector_length=1)
-mykb.initialize(nlp.vocab)
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -228,16 +283,18 @@ def test_preserving_links_asdoc(nlp):
"""Test that Span.as_doc preserves the existing entity links"""

@registry.assets.register("myLocationsKB.v1")
-def dummy_kb() -> KnowledgeBase:
-mykb = KnowledgeBase(entity_vector_length=1)
-mykb.initialize(nlp.vocab)
-# adding entities
-mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
-mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
-# adding aliases
-mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
-mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6])
-return mykb
+def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
+def create_kb(vocab):
+mykb = KnowledgeBase(vocab, entity_vector_length=1)
+# adding entities
+mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
+mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
+# adding aliases
+mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
+mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6])
+return mykb
+
+return create_kb

# set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained)
nlp.add_pipe("sentencizer")
@@ -247,7 +304,7 @@ def test_preserving_links_asdoc(nlp):
]
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
-el_config = {"kb": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
+el_config = {"kb_loader": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
el_pipe.begin_training(lambda: [])
el_pipe.incl_context = False
@@ -331,24 +388,28 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(doc, annotation))

@registry.assets.register("myOverfittingKB.v1")
-def dummy_kb() -> KnowledgeBase:
-# create artificial KB - assign same prior weight to the two russ cochran's
-# Q2146908 (Russ Cochran): American golfer
-# Q7381115 (Russ Cochran): publisher
-mykb = KnowledgeBase(entity_vector_length=3)
-mykb.initialize(nlp.vocab)
-mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
-mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
-mykb.add_alias(
-alias="Russ Cochran",
-entities=["Q2146908", "Q7381115"],
-probabilities=[0.5, 0.5],
-)
-return mykb
+def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
+def create_kb(vocab):
+# create artificial KB - assign same prior weight to the two russ cochran's
+# Q2146908 (Russ Cochran): American golfer
+# Q7381115 (Russ Cochran): publisher
+mykb = KnowledgeBase(vocab, entity_vector_length=3)
+mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+mykb.add_alias(
+alias="Russ Cochran",
+entities=["Q2146908", "Q7381115"],
+probabilities=[0.5, 0.5],
+)
+return mykb
+
+return create_kb

# Create the Entity Linker component and add it to the pipeline
nlp.add_pipe(
-"entity_linker", config={"kb": {"@assets": "myOverfittingKB.v1"}}, last=True
+"entity_linker",
+config={"kb_loader": {"@assets": "myOverfittingKB.v1"}},
+last=True,
)

# train the NEL pipe
@@ -78,6 +78,14 @@ def test_replace_last_pipe(nlp):
assert nlp.pipe_names == ["sentencizer", "ner"]


+def test_replace_pipe_config(nlp):
+nlp.add_pipe("entity_linker")
+nlp.add_pipe("sentencizer")
+assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == True
+nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False})
+assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == False
+
+
@pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")])
def test_rename_pipe(nlp, old_name, new_name):
with pytest.raises(ValueError):
@@ -139,8 +139,7 @@ def test_issue4665():
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
-kb = KnowledgeBase(entity_vector_length=3)
-kb.initialize(nlp.vocab)
+kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
@@ -156,10 +155,9 @@ def test_issue4674():
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
-kb.dump(str(file_path))
-kb2 = KnowledgeBase(entity_vector_length=3)
-kb2.initialize(nlp.vocab)
-kb2.load_bulk(str(file_path))
+kb.to_disk(str(file_path))
+kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+kb2.from_disk(str(file_path))
assert kb2.get_size_entities() == 1


@@ -1,3 +1,4 @@
+from typing import Callable
import warnings
from unittest import TestCase
import pytest
@@ -70,13 +71,14 @@ def entity_linker():
nlp = Language()

@registry.assets.register("TestIssue5230KB.v1")
-def dummy_kb() -> KnowledgeBase:
-kb = KnowledgeBase(entity_vector_length=1)
-kb.initialize(nlp.vocab)
-kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
-return kb
+def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
+def create_kb(vocab):
+kb = KnowledgeBase(vocab, entity_vector_length=1)
+kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
+return kb
+return create_kb

-config = {"kb": {"@assets": "TestIssue5230KB.v1"}}
+config = {"kb_loader": {"@assets": "TestIssue5230KB.v1"}}
entity_linker = nlp.add_pipe("entity_linker", config=config)
# need to add model for two reasons:
# 1. no model leads to error in serialization,
@@ -121,19 +123,17 @@ def test_writer_with_path_py35():

def test_save_and_load_knowledge_base():
nlp = Language()
-kb = KnowledgeBase(entity_vector_length=1)
-kb.initialize(nlp.vocab)
+kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
with make_tempdir() as d:
path = d / "kb"
try:
-kb.dump(path)
+kb.to_disk(path)
except Exception as e:
pytest.fail(str(e))

try:
-kb_loaded = KnowledgeBase(entity_vector_length=1)
-kb_loaded.initialize(nlp.vocab)
-kb_loaded.load_bulk(path)
+kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+kb_loaded.from_disk(path)
except Exception as e:
pytest.fail(str(e))

@@ -1,4 +1,8 @@
-from spacy.util import ensure_path
+from typing import Callable
+
+from spacy import util
+from spacy.lang.en import English
+from spacy.util import ensure_path, registry
from spacy.kb import KnowledgeBase

from ..util import make_tempdir
@@ -15,20 +19,16 @@ def test_serialize_kb_disk(en_vocab):
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
-kb1.dump(str(file_path))
-
-kb2 = KnowledgeBase(entity_vector_length=3)
-kb2.initialize(en_vocab)
-kb2.load_bulk(str(file_path))
+kb1.to_disk(str(file_path))
+kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
+kb2.from_disk(str(file_path))

# final assertions
_check_kb(kb2)


def _get_dummy_kb(vocab):
-kb = KnowledgeBase(entity_vector_length=3)
-kb.initialize(vocab)
-
+kb = KnowledgeBase(vocab, entity_vector_length=3)
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
@@ -61,7 +61,7 @@ def _check_kb(kb):
assert alias_string not in kb.get_alias_strings()

# check candidates & probabilities
-candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_)
+candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
assert len(candidates) == 2

assert candidates[0].entity_ == "Q007"
@@ -75,3 +75,47 @@ def _check_kb(kb):
assert candidates[1].entity_vector == [7, 1, 0]
assert candidates[1].alias_ == "double07"
assert 0.099 < candidates[1].prior_prob < 0.101
+
+
+def test_serialize_subclassed_kb():
+"""Check that IO of a custom KB works fine as part of an EL pipe."""
+
+class SubKnowledgeBase(KnowledgeBase):
+def __init__(self, vocab, entity_vector_length, custom_field):
+super().__init__(vocab, entity_vector_length)
+self.custom_field = custom_field
+
+@registry.assets.register("spacy.CustomKB.v1")
+def custom_kb(
+entity_vector_length: int, custom_field: int
+) -> Callable[["Vocab"], KnowledgeBase]:
+def custom_kb_factory(vocab):
+return SubKnowledgeBase(
+vocab=vocab,
+entity_vector_length=entity_vector_length,
+custom_field=custom_field,
+)
+
+return custom_kb_factory
+
+nlp = English()
+config = {
+"kb_loader": {
+"@assets": "spacy.CustomKB.v1",
+"entity_vector_length": 342,
+"custom_field": 666,
+}
+}
+entity_linker = nlp.add_pipe("entity_linker", config=config)
+assert type(entity_linker.kb) == SubKnowledgeBase
+assert entity_linker.kb.entity_vector_length == 342
+assert entity_linker.kb.custom_field == 666
+
+# Make sure the custom KB is serialized correctly
+with make_tempdir() as tmp_dir:
+nlp.to_disk(tmp_dir)
+nlp2 = util.load_model_from_path(tmp_dir)
+entity_linker2 = nlp2.get_pipe("entity_linker")
+assert type(entity_linker2.kb) == SubKnowledgeBase
+assert entity_linker2.kb.entity_vector_length == 342
+assert entity_linker2.kb.custom_field == 666
@@ -33,7 +33,7 @@ TODO: intro and how architectures work, link to
> subword_features = true
> ```

-Build spaCy's "standard" tok2vec layer, which uses hash embedding with subword
+Build spaCy's "standard" embedding layer, which uses hash embedding with subword
features and a CNN with layer-normalized maxout.

| Name | Description |
@@ -45,6 +45,7 @@ features and a CNN with layer-normalized maxout.
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

### spacy.Tok2Vec.v1 {#Tok2Vec}

@@ -67,10 +68,11 @@ Construct a tok2vec model out of embedding and encoding subnetworks. See the
["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp)
blog post for background.

-| Name | Description |
-| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
+| Name | Description |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

### spacy.Tok2VecListener.v1 {#Tok2VecListener}

@@ -108,10 +110,13 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
argument that connects to the shared `tok2vec` component in the pipeline.

-| Name | Description |
-| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
-| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
<!-- TODO: return type -->

+| Name | Description |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
+| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
+| **CREATES** | The model using the architecture. ~~Model~~ |

### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}

@@ -134,12 +139,15 @@ definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
pretrained static vectors can also be incorporated into the concatenated
representation.

+<!-- TODO: model return type -->
+
| Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
| `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ |
| `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ |
| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
+| **CREATES** | The model using the architecture. ~~Model~~ |

### spacy.CharacterEmbed.v1 {#CharacterEmbed}

@@ -170,12 +178,15 @@ concatenated. A hash-embedded vector of the `NORM` of the word is also
concatenated on, and the result is then passed through a feed-forward network to
construct a single vector to represent the information.

-| Name | Description |
-| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `width` | The width of the output vector and the `NORM` hash embedding. ~~int~~ |
-| `rows` | The number of rows in the `NORM` hash embedding table. ~~int~~ |
-| `nM` | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. ~~int~~ |
-| `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ |
+<!-- TODO: model return type -->
+
+| Name | Description |
+| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `width` | The width of the output vector and the `NORM` hash embedding. ~~int~~ |
+| `rows` | The number of rows in the `NORM` hash embedding table. ~~int~~ |
+| `nM` | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. ~~int~~ |
+| `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ |
+| **CREATES** | The model using the architecture. ~~Model~~ |

### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder}

@ -199,6 +210,7 @@ and residual connections.
|
|||
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
|
||||
| `maxout_pieces` | The number of maxout pieces to use. Recommended values are `2` or `3`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
|
||||
|
||||
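As a sketch, this encoder is typically plugged in as the `encode` sublayer of a tok2vec model in the config. The section path and values below are illustrative assumptions, not required settings:

```ini
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
window_size = 1
maxout_pieces = 3
depth = 4
```
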
### spacy.MishWindowEncoder.v1 {#MishWindowEncoder}
@@ -221,6 +233,7 @@ and residual connections.
| `width`       | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
| `depth`       | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES**   | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |

### spacy.TorchBiLSTMEncoder.v1 {#TorchBiLSTMEncoder}
@@ -242,10 +255,38 @@ Encode context using bidirectional LSTM layers. Requires
| `width`       | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
| `depth`       | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES**   | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |

### spacy.StaticVectors.v1 {#StaticVectors}
<!-- TODO: -->

> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.StaticVectors.v1"
> nO = null
> nM = null
> dropout = 0.2
> key_attr = "ORTH"
>
> [model.init_W]
> @initializers = "glorot_uniform_init.v1"
> ```

Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a
learned linear projection to control the dimensionality. See the documentation
on [static vectors](/usage/embeddings-transformers#static-vectors) for details.

<!-- TODO: document argument descriptions -->

| Name        | Description |
| ----------- | ----------- |
| `nO`        | Defaults to `None`. ~~Optional[int]~~ |
| `nM`        | Defaults to `None`. ~~Optional[int]~~ |
| `dropout`   | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ |
| `init_W`    | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]], FloatsXd]~~ |
| `key_attr`  | Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |

## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
@@ -277,6 +318,7 @@ architectures into your training config.
| `name`             | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ |
| `get_spans`        | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
| **CREATES**        | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ |

### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}
@@ -305,6 +347,7 @@ a single token vector given zero or more wordpiece vectors.
| ------------- | ----------- |
| `pooling`     | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
| **CREATES**   | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
@@ -330,6 +373,7 @@ one component.
| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
| `pooling`          | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
| `grad_factor`      | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
| **CREATES**        | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

## Parser & NER architectures {#parser}
@@ -372,6 +416,8 @@ consists of either two or three subnetworks:
state representation. If not present, the output from the lower model is used
as action scores directly.

<!-- TODO: model return type -->

| Name                | Description |
| ------------------- | ----------- |
| `tok2vec`           | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
@@ -380,6 +426,7 @@ consists of either two or three subnetworks:
| `maxout_pieces`     | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
| `use_upper`         | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
| `nO`                | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES**         | The model using the architecture. ~~Model~~ |

### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}
@@ -406,9 +453,10 @@ generally results in better linear separation between classes, especially for
non-CRF models, because there are more distinct classes for the different
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).

| Name      | Description |
| --------- | ----------- |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| Name        | Description |
| ----------- | ----------- |
| `tok2vec`   | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}
@@ -431,9 +479,10 @@ spans into tags assigned to each token. The first token of a span is given the
tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens
are assigned the tag O.

| Name      | Description |
| --------- | ----------- |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| Name        | Description |
| ----------- | ----------- |
| `tok2vec`   | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
@@ -454,10 +503,11 @@ Build a tagger model, using a provided token-to-vector component. The tagger
model simply adds a linear layer with softmax activation to predict scores given
the token vectors.

| Name      | Description |
| --------- | ----------- |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| `nO`      | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ |
| Name        | Description |
| ----------- | ----------- |
| `tok2vec`   | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| `nO`        | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
@@ -474,9 +524,6 @@ specific data and challenge.

### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}

Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.

> #### Example Config
>
> ```ini
@@ -493,17 +540,23 @@ network has an internal CNN Tok2Vec layer and uses attention.
> nO = null
> ```

| Name                 | Description |
| -------------------- | ----------- |
| `exclusive_classes`  | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width`              | Output dimension of the feature encoding step. ~~int~~ |
| `embed_size`         | Input dimension of the feature encoding step. ~~int~~ |
| `conv_depth`         | Depth of the tok2vec layer. ~~int~~ |
| `window_size`        | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
| `dropout`            | The dropout rate. ~~float~~ |

Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.

<!-- TODO: model return type -->

| Name                 | Description |
| -------------------- | ----------- |
| `exclusive_classes`  | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width`              | Output dimension of the feature encoding step. ~~int~~ |
| `embed_size`         | Input dimension of the feature encoding step. ~~int~~ |
| `conv_depth`         | Depth of the tok2vec layer. ~~int~~ |
| `window_size`        | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
| `dropout`            | The dropout rate. ~~float~~ |
| `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES**          | The model using the architecture. ~~Model~~ |

### spacy.TextCatCNN.v1 {#TextCatCNN}
@@ -530,11 +583,14 @@ A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.

| Name                | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |

<!-- TODO: model return type -->

| Name                | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES**         | The model using the architecture. ~~Model~~ |

### spacy.TextCatBOW.v1 {#TextCatBOW}
@@ -552,12 +608,15 @@ architecture is usually less accurate than the ensemble, but runs faster.
An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.

| Name                | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |

<!-- TODO: model return type -->

| Name                | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES**         | The model using the architecture. ~~Model~~ |

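To make the arguments above concrete, a minimal block for this architecture might look like the following; the values are illustrative:

```ini
[model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
nO = null
```
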
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@@ -574,9 +633,6 @@ into the "real world". This requires 3 main component

### spacy.EntityLinker.v1 {#EntityLinker}

The `EntityLinker` model architecture is a Thinc `Model` with a
[`Linear`](https://thinc.ai/api-layers#linear) output layer.

> #### Example Config
>
> ```ini
@@ -602,10 +658,16 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
> @assets = "spacy.CandidateGenerator.v1"
> ```

| Name      | Description |
| --------- | ----------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO`      | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |

The `EntityLinker` model architecture is a Thinc `Model` with a
[`Linear`](https://thinc.ai/api-layers#linear) output layer.

<!-- TODO: model return type -->

| Name        | Description |
| ----------- | ----------- |
| `tok2vec`   | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model~~ |

### spacy.EmptyKB.v1 {#EmptyKB}
@@ -719,11 +719,11 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc]

Generate an installable
[model Python package](/usage/training#models-generating) from an existing model
data directory. All data files are copied over. If the path to a `meta.json` is
supplied, or a `meta.json` is found in the input directory, this file is used.
Otherwise, the data can be entered directly from the command line. spaCy will
then create a `.tar.gz` archive file that you can distribute and install with
`pip install`.
data directory. All data files are copied over. If the path to a
[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in
the input directory, this file is used. Otherwise, the data can be entered
directly from the command line. spaCy will then create a `.tar.gz` archive file
that you can distribute and install with `pip install`.

<Infobox title="New in v3.0" variant="warning">

@@ -750,7 +750,7 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
| ------------------------------------------------ | ----------- |
| `input_dir`                                      | Path to directory containing model data. ~~Path (positional)~~ |
| `output_dir`                                     | Directory to create package folder in. ~~Path (positional)~~ |
| `--meta-path`, `-m` <Tag variant="new">2</Tag>   | Path to `meta.json` file (optional). ~~Optional[Path] \(option)~~ |
| `--meta-path`, `-m` <Tag variant="new">2</Tag>   | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
| `--no-sdist`, `-NS`                              | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ |
| `--version`, `-v` <Tag variant="new">3</Tag>     | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ |

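For example, a typical invocation could look like the following, where the input and output paths are placeholders:

```cli
$ python -m spacy package /path/to/model-best /path/to/packages --version 1.0.0
```
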
@@ -6,6 +6,7 @@ menu:
  - ['Training Data', 'training']
  - ['Pretraining Data', 'pretraining']
  - ['Vocabulary', 'vocab']
  - ['Model Meta', 'meta']
---

This section documents input and output formats of data used by spaCy, including
@@ -73,15 +74,15 @@ your config and check that it's valid, you can run the
Defines the `nlp` object, its tokenizer and
[processing pipeline](/usage/processing-pipelines) component names.

| Name | Description | Default |
| ------------------------- | ----------- | ------- |
| `lang` | The language code to use. ~~str~~ | `null` |
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). ~~List[str]~~ | `[]` |
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. ~~bool~~ | `true` |
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | `null` |
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. ~~Optional[Callable[[Language], Language]]~~ | `null` |
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. ~~Optional[Callable[[Language], Language]]~~ | `null` |
| `tokenizer` | The tokenizer to use. ~~Callable[[str], Doc]~~ | [`Tokenizer`](/api/tokenizer) |
| Name | Description |
| ------------------------- | ----------- |
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ |

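Putting a few of these settings together, a simple `[nlp]` block could look like the following sketch. The pipeline names are placeholders for sections you define under `[components]`:

```ini
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger", "parser", "ner"]
load_vocab_data = true
```
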
### components {#config-components tag="section"}
@@ -128,24 +129,24 @@ process that are used when you run [`spacy train`](/api/cli#train).

<!-- TODO: complete -->

| Name | Description | Default |
| --------------------- | ----------- | ------- |
| `seed` | The random seed. ~~int~~ | `${system:seed}` |
| `dropout` | The dropout rate. ~~float~~ | `0.1` |
| `accumulate_gradient` | Whether to divide the batch up into substeps. ~~int~~ | `1` |
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). ~~Optional[str]~~ | `${paths:init_tok2vec}` |
| `raw_text` | ~~Optional[str]~~ | `${paths:raw}` |
| `vectors` | ~~Optional[str]~~ | `null` |
| `patience` | How many steps to continue without improvement in evaluation score. ~~int~~ | `1600` |
| `max_epochs` | Maximum number of epochs to train for. ~~int~~ | `0` |
| `max_steps` | Maximum number of update steps to train for. ~~int~~ | `20000` |
| `eval_frequency` | How often to evaluate during training (steps). ~~int~~ | `200` |
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. ~~Dict[str, float]~~ | `{}` |
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. ~~List[str]~~ | `[]` |
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. ~~Callable[[Language], Iterator[Example]]~~ | [`Corpus`](/api/corpus) |
| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. ~~Callable[[Language], Iterator[Example]]~~ | [`Corpus`](/api/corpus) |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | [`batch_by_words`](/api/top-level#batch_by_words) |
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. ~~Optimizer~~ | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
| Name | Description |
| --------------------- | ----------- |
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ |
| `raw_text` | TODO: ... Defaults to variable `${paths:raw}`. ~~Optional[str]~~ |
| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |

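As an illustration, a `[training]` block that spells out a few of the defaults above might look like this. A literal seed is used here in place of the `${system:seed}` variable:

```ini
[training]
seed = 0
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
```
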
### pretraining {#config-pretraining tag="section,optional"}
@@ -153,19 +154,19 @@ This section is optional and defines settings and controls for
[language model pretraining](/usage/training#pretraining). It's used when you
run [`spacy pretrain`](/api/cli#pretrain).

| Name | Description | Default |
| ---------------------------- | ----------- | ------- |
| `max_epochs` | Maximum number of epochs. ~~int~~ | `1000` |
| `min_length` | Minimum length of examples. ~~int~~ | `5` |
| `max_length` | Maximum length of examples. ~~int~~ | `500` |
| `dropout` | The dropout rate. ~~float~~ | `0.2` |
| `n_save_every` | Saving frequency. ~~int~~ | `null` |
| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). ~~Union[int, Sequence[int]]~~ | `3000` |
| `seed` | The random seed. ~~int~~ | `${system:seed}` |
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. ~~bool~~ | `${system:use_pytorch_for_gpu_memory}` |
| `tok2vec_model` | tok2vec model section in the config. ~~str~~ | `"components.tok2vec.model"` |
| `objective` | The pretraining objective. ~~Dict[str, Any]~~ | `{"type": "characters", "n_characters": 4}` |
| `optimizer` | The optimizer. ~~Optimizer~~ | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
| Name | Description |
| ---------------------------- | ----------- |
| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
| `min_length` | Minimum length of examples. Defaults to `5`. ~~int~~ |
| `max_length` | Maximum length of examples. Defaults to `500`. ~~int~~ |
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ |
| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |

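For reference, a sketch of a `[pretraining]` block using the listed defaults, with literal values in place of the variables:

```ini
[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
batch_size = 3000
tok2vec_model = "components.tok2vec.model"
```
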
## Training data {#training}
@@ -372,11 +373,11 @@ example = Example.from_dict(doc, gold_dict)

## Pretraining data {#pretraining}

The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the tok2vec
layer of pipeline components from raw text. Raw text can be provided as a
`.jsonl` (newline-delimited JSON) file containing one input text per line
(roughly paragraph length is good). Optionally, custom tokenization can be
provided.
The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the
"token-to-vector" embedding layer of pipeline components from raw text. Raw text
can be provided as a `.jsonl` (newline-delimited JSON) file containing one input
text per line (roughly paragraph length is good). Optionally, custom
tokenization can be provided.

> #### Tip: Writing JSONL
>
@@ -457,3 +458,75 @@ Here's an example of the 20 most frequent lexemes in the English training data:

```json
https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl
```

## Model meta {#meta}
The model meta is available as the file `meta.json` and exported automatically
when you save an `nlp` object to disk. Its contents are available as
[`nlp.meta`](/api/language#meta).

<Infobox variant="warning" title="Changed in v3.0">

As of spaCy v3.0, the `meta.json` **isn't** used to construct the language class
and pipeline anymore and only contains meta information for reference and for
creating a Python package with [`spacy package`](/api/cli#package). How to set
up the `nlp` object is now defined in the
[`config.cfg`](/api/data-formats#config), which includes detailed information
about the pipeline components and their model architectures, and all other
settings and hyperparameters used to train the model. It's the **single source
of truth** used for loading a model.

</Infobox>

> #### Example
>
> ```json
> {
>   "name": "example_model",
>   "lang": "en",
>   "version": "1.0.0",
>   "spacy_version": ">=3.0.0,<3.1.0",
>   "parent_package": "spacy",
>   "description": "Example model for spaCy",
>   "author": "You",
>   "email": "you@example.com",
>   "url": "https://example.com",
>   "license": "CC BY-SA 3.0",
>   "sources": [{ "name": "My Corpus", "license": "MIT" }],
>   "vectors": { "width": 0, "vectors": 0, "keys": 0, "name": null },
>   "pipeline": ["tok2vec", "ner", "textcat"],
>   "labels": {
>     "ner": ["PERSON", "ORG", "PRODUCT"],
>     "textcat": ["POSITIVE", "NEGATIVE"]
>   },
>   "accuracy": {
>     "ents_f": 82.7300930714,
>     "ents_p": 82.135523614,
>     "ents_r": 83.3333333333,
>     "textcat_score": 88.364323811
>   },
>   "speed": { "cpu": 7667.8, "gpu": null, "nwords": 10329 },
>   "spacy_git_version": "61dfdd9fb"
> }
> ```

| Name | Description |
| ---------------------------------------------- | ----------- |
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ |
| `name` | Model name, e.g. `"core_web_sm"`. The final model package name will be `{lang}_{name}`. Defaults to `"model"`. ~~str~~ |
| `version` | Model version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
| `spacy_version` | spaCy version range the model is compatible with. Defaults to the spaCy version used to create the model, up to the next minor version, which is the default compatibility for the available [pretrained models](/models). For instance, a model trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy_nightly"`. Defaults to `"spacy"`. ~~str~~ |
| `description` | Model description. Also used for Python package. Defaults to `""`. ~~str~~ |
| `author` | Model author name. Also used for Python package. Defaults to `""`. ~~str~~ |
| `email` | Model author email. Also used for Python package. Defaults to `""`. ~~str~~ |
| `url` | Model author URL. Also used for Python package. Defaults to `""`. ~~str~~ |
| `license` | Model license. Also used for Python package. Defaults to `""`. ~~str~~ |
| `sources` | Data sources used to train the model. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ |
| `vectors` | Information about the word vectors included with the model. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ |
| `pipeline` | Names of pipeline components in the model, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ |
| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ |
| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
| `speed` | Model speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ |
| `spacy_git_version` <Tag variant="new">3</Tag> | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create model. ~~str~~ |
| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ |

@@ -200,21 +200,21 @@ probability of the fact that the mention links to the entity ID.
| `alias`     | The textual mention or alias. ~~str~~ |
| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |

## KnowledgeBase.dump {#dump tag="method"}
## KnowledgeBase.to_disk {#to_disk tag="method"}

Save the current state of the knowledge base to a directory.

> #### Example
>
> ```python
> kb.dump(loc)
> kb.to_disk(loc)
> ```

| Name  | Description |
| ----- | ----------- |
| `loc` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |

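A small end-to-end sketch of saving and restoring a knowledge base; the paths, entity IDs and vector length are hypothetical:

```python
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

vocab = Vocab()
kb = KnowledgeBase(vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])

# Save the KB and the vocab it was created with
vocab.to_disk("/path/to/vocab")
kb.to_disk("/path/to/kb")

# Reload: the KB needs the same vocab and vector length it was created with
vocab2 = Vocab().from_disk("/path/to/vocab")
kb2 = KnowledgeBase(vocab2, entity_vector_length=3)
kb2.from_disk("/path/to/kb")
```
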
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
## KnowledgeBase.from_disk {#from_disk tag="method"}

Restore the state of the knowledge base from a given directory. Note that the
[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.

@@ -226,7 +226,7 @@ Restore the state of the knowledge base from a given directory. Note that the
> from spacy.vocab import Vocab
> vocab = Vocab().from_disk("/path/to/vocab")
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> kb.load_bulk("/path/to/kb")
> kb.from_disk("/path/to/kb")
> ```

| Name | Description |

@@ -742,7 +742,7 @@ token.ent_iob, token.ent_type

Custom meta data for the Language class. If a model is loaded, contains meta
data of the model. The `Language.meta` is also what's serialized as the
`meta.json` when you save an `nlp` object to disk.
[`meta.json`](/api/data-formats#meta) when you save an `nlp` object to disk.

> #### Example
>
@@ -954,12 +954,12 @@ serialization by passing in the string names via the `exclude` argument.
> nlp.from_disk("./model-data", exclude=["ner"])
> ```

| Name        | Description |
| ----------- | ----------- |
| `vocab`     | The shared [`Vocab`](/api/vocab). |
| `tokenizer` | Tokenization rules and exceptions. |
| `meta`      | The meta data, available as `Language.meta`. |
| ...         | String names of pipeline components, e.g. `"ner"`. |
| Name        | Description |
| ----------- | ----------- |
| `vocab`     | The shared [`Vocab`](/api/vocab). |
| `tokenizer` | Tokenization rules and exceptions. |
| `meta`      | The meta data, available as [`Language.meta`](/api/language#meta). |
| ...         | String names of pipeline components, e.g. `"ner"`. |

## FactoryMeta {#factorymeta new="3" tag="dataclass"}

@@ -15,7 +15,7 @@ multiple components, e.g. to have one embedding and CNN network shared between a
[`EntityRecognizer`](/api/entityrecognizer).

In order to use the `Tok2Vec` predictions, subsequent components should use the
[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec
[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the `tok2vec`
subnetwork of their model. This layer will read data from the `doc.tensor`
attribute during prediction. During training, the `Tok2Vec` component will save
its prediction and backprop callback for each batch, so that the subsequent

@@ -18,9 +18,10 @@ Load a model using the name of an installed
`Path`-like object. spaCy will try resolving the load argument in this order. If
a model is loaded from a model name, spaCy will assume it's a Python package and
import it and call the model's own `load()` method. If a model is loaded from a
path, spaCy will assume it's a data directory, read the language and pipeline
settings off the meta.json and initialize the `Language` class. The data will be
loaded in via [`Language.from_disk`](/api/language#from_disk).
path, spaCy will assume it's a data directory, load its
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
information to construct the `Language` class. The data will be loaded in via
[`Language.from_disk`](/api/language#from_disk).

> #### Example
>
@@ -40,9 +41,10 @@ loaded in via [`Language.from_disk`](/api/language#from_disk).
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS**                         | A `Language` object with the loaded model. ~~Language~~ |

Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
and pipeline components from a model's `meta.json`, initializes the `Language`
class, loads in the model data and returns it.
Essentially, `spacy.load()` is a convenience wrapper that reads the model's
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
information to construct a `Language` object, loads in the model data and
returns it.

```python
### Abstract example

@@ -543,8 +545,8 @@ loaded lazily, to avoid expensive setup code associated with the language data.
Load a model from a package or data path. If called with a package name, spaCy
will assume the model is a Python package and import and call its `load()`
method. If called with a path, spaCy will assume it's a data directory, read the
language and pipeline settings from the meta.json and initialize a `Language`
class. The model data will then be loaded in via
language and pipeline settings from the [`config.cfg`](/api/data-formats#config)
and create a `Language` object. The model data will then be loaded in via
[`Language.from_disk`](/api/language#from_disk).

> #### Example
@@ -607,7 +609,8 @@ components are created, as well as all training settings and hyperparameters.

### util.load_meta {#util.load_meta tag="function" new="3"}

Get a model's `meta.json` from a file path and validate its contents.
Get a model's [`meta.json`](/api/data-formats#meta) from a file path and
validate its contents.

> #### Example
>


41  website/docs/images/tok2vec-listener.svg  Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 17 KiB

17  website/docs/images/tok2vec.svg  Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 18 KiB

@@ -9,11 +9,7 @@ menu:
next: /usage/training
---

<!-- TODO: intro, short explanation of embeddings/transformers, point user to processing pipelines docs for intro -->

## Shared embedding layers {#embedding-layers}

<!-- TODO: write: `Tok2Vec` and `Transformer` components -->
<!-- TODO: intro, short explanation of embeddings/transformers, Tok2Vec and Transformer components, point user to processing pipelines docs for more general info that user should know first -->

<Accordion title="What’s the difference between word vectors and language models?" id="vectors-vs-language-models">

@@ -55,6 +51,22 @@ of performance.

</Accordion>

## Shared embedding layers {#embedding-layers}

<!-- TODO: write -->

![Pipeline components using a shared embedding component vs. independent embedding layers](../images/tok2vec.svg)

| Shared | Independent |
| ------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- |
| ✅ **smaller:** models only need to include a single copy of the embeddings | ❌ **larger:** models need to include the embeddings for each component |
| ✅ **faster:** | ❌ **slower:** |
| ❌ **less composable:** all components require the same embedding component in the pipeline | ✅ **modular:** components can be moved and swapped freely |

![Pipeline components listening to shared embedding component](../images/tok2vec-listener.svg)

<!-- TODO: explain the listener concept, how it works etc. -->

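Roughly, a shared setup pairs one `Tok2Vec` component with downstream components that use a [Tok2VecListener](/api/architectures#Tok2VecListener) as their `tok2vec` sublayer in the config. The following sketch is illustrative only; the architecture names and the `width` and `upstream` values are assumptions based on the current defaults:

```ini
[components.tok2vec]
factory = "tok2vec"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = 96
upstream = "tok2vec"
```
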
## Using transformer models {#transformers}
Transformers are a family of neural network architectures that compute **dense,
@@ -295,18 +307,6 @@ is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the
transformer data.

> #### Model type annotations
>
> In the documentation and code base, you may come across type annotations and
> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc],
> List[Floats2d]]~~. This so-called generic type describes the layer and its
> input and output type – in this case, it takes a list of `Doc` objects as the
> input and list of 2-dimensional arrays of floats as the output. You can read
> more about defining Thinc models [here](https://thinc.ai/docs/usage-models).
> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for
> how to enable linting in your editor to see live feedback if your inputs and
> outputs don't match.

The same idea applies to task models that power the **downstream components**.
Most of spaCy's built-in model creation functions support a `tok2vec` argument,
which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. This

@@ -70,8 +70,7 @@ import Languages from 'widgets/languages.js'
> nlp = MultiLanguage()
>
> # With lazy-loading
> from spacy.util import get_lang_class
> nlp = get_lang_class('xx')
> nlp = spacy.blank("xx")
> ```

spaCy also supports models trained on more than one language. This is especially
@@ -80,10 +79,10 @@ language-neutral models is `xx`. The language class, a generic subclass
containing only the base language data, can be found in
[`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx).

To load your model with the neutral, multi-language class, simply set
`"language": "xx"` in your [model package](/usage/training#models-generating)'s
`meta.json`. You can also import the class directly, or call
[`util.get_lang_class()`](/api/top-level#util.get_lang_class) for lazy-loading.
To train a model using the neutral multi-language class, you can set
`lang = "xx"` in your [training config](/usage/training#config). You can also
import the `MultiLanguage` class directly, or call
[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.

### Chinese language support {#chinese new=2.3}

@@ -308,12 +307,14 @@ model data.

```yaml
### Directory structure {highlight="7"}
└── en_core_web_md-3.0.0.tar.gz       # downloaded archive
    ├── meta.json                     # model meta data
    ├── setup.py                      # setup file for pip installation
    ├── meta.json                     # copy of model meta
    └── en_core_web_md                # 📦 model package
        ├── __init__.py               # init for pip installation
        ├── meta.json                 # model meta data
        └── en_core_web_md-3.0.0      # model data
            ├── config.cfg            # model config
            ├── meta.json             # model meta
            └── ...                   # directories with component data
```

You can place the **model package directory** anywhere on your local file


@@ -232,7 +232,7 @@ available pipeline components and component functions.
| `morphologizer` | [`Morphologizer`](/api/morphologizer)           | Assign morphological features and coarse-grained POS tags. |
| `senter`        | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. |
| `sentencizer`   | [`Sentencizer`](/api/sentencizer)               | Add rule-based sentence segmentation without the dependency parse. |
| `tok2vec`       | [`Tok2Vec`](/api/tok2vec)                       | |
| `tok2vec`       | [`Tok2Vec`](/api/tok2vec)                       | Assign token-to-vector embeddings. |
| `transformer`   | [`Transformer`](/api/transformer)               | Assign the tokens and outputs of a transformer model. |

### Disabling and modifying pipeline components {#disabling}


@@ -1096,11 +1096,12 @@ ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.to_disk("/path/to/model")
```

The saved model now includes the `"entity_ruler"` in its `"pipeline"` setting in
the `meta.json`, and the model directory contains a file `entityruler.jsonl`
with the patterns. When you load the model back in, all pipeline components will
be restored and deserialized – including the entity ruler. This lets you ship
powerful model packages with binary weights _and_ rules included!
The saved model now includes the `"entity_ruler"` in its
[`config.cfg`](/api/data-formats#config) and the model directory contains a file
`entityruler.jsonl` with the patterns. When you load the model back in, all
pipeline components will be restored and deserialized – including the entity
ruler. This lets you ship powerful model packages with binary weights _and_
rules included!

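A quick way to verify this is to load the saved model back in and inspect its pipeline, as in this minimal sketch reusing the path from the example above:

```python
import spacy

nlp = spacy.load("/path/to/model")
print(nlp.pipe_names)  # includes "entity_ruler"
ruler = nlp.get_pipe("entity_ruler")
print(len(ruler.patterns))  # the patterns restored from entityruler.jsonl
```
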
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
|
@ -569,9 +569,32 @@ back later. You can do this with the
|
|||
nlp.to_disk('/home/me/data/en_example_model')
|
||||
```
|
||||
|
||||
The directory will be created if it doesn't exist, and the whole pipeline will
|
||||
be written out. To make the model more convenient to deploy, we recommend
|
||||
wrapping it as a Python package.
|
||||
The directory will be created if it doesn't exist, and the whole pipeline data,
|
||||
model meta and model configuration will be written out. To make the model more
|
||||
convenient to deploy, we recommend wrapping it as a
|
||||
[Python package](/api/cli#package).
|
||||
|
||||
<Accordion title="What’s the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config">
|
||||
|
||||
When you save a model in spaCy v3.0+, two files will be exported: a
|
||||
[`config.cfg`](/api/data-formats#config) based on
|
||||
[`nlp.config`](/api/language#config) and a [`meta.json`](/api/data-formats#meta)
|
||||
based on [`nlp.meta`](/api/language#meta).
|
||||
|
||||
- **config**: Configuration used to create the current `nlp` object, its
|
||||
pipeline components and models, as well as training settings and
|
||||
hyperparameters. Can include references to registered functions like
|
||||
[pipeline components](/usage/processing-pipelines#custom-components) or
|
||||
[model architectures](/api/architectures). Given a config, spaCy is able
|
||||
reconstruct the whole tree of objects and the `nlp` object. An exported config
|
||||
can also be used to [train a model](/usage/training#conig) with the same
|
||||
settings.
|
||||
- **meta**: Meta information about the model and the Python package, such as the
|
||||
author information, license, version, data sources and label scheme. This is
|
||||
mostly used for documentation purposes and for packaging models. It has no
|
||||
impact on the functionality of the `nlp` object.
|
||||
|
||||
</Accordion>
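
Both are available on a loaded pipeline, so you can inspect them directly – a
minimal sketch, assuming a trained package like `en_core_web_sm` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
print(nlp.config["nlp"]["pipeline"])            # how the pipeline is constructed
print(nlp.meta["author"], nlp.meta["license"])  # documentation metadata
```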
|
||||
|
||||
### Generating a model package {#models-generating}
|
||||
|
||||
|
@@ -623,6 +646,9 @@ model package that can be installed using `pip install`.
|
|||
├── en_example_model # model directory
|
||||
│ ├── __init__.py # init for pip installation
|
||||
│ └── en_example_model-1.0.0 # model data
|
||||
│ ├── config.cfg # model config
|
||||
│ ├── meta.json # model meta
|
||||
│ └── ... # directories with component data
|
||||
└── dist
|
||||
└── en_example_model-1.0.0.tar.gz # installable package
|
||||
```
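
Once the package is built and installed, e.g. via
`pip install dist/en_example_model-1.0.0.tar.gz`, it can be loaded like any
other model – a sketch using the example package name from above:

```python
import spacy
import en_example_model  # the hypothetical package generated above

nlp = spacy.load("en_example_model")  # load by package name
nlp = en_example_model.load()         # or import the package and call load()
```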
|
||||
|
@@ -644,13 +670,25 @@ you can also **ship the code with your model** and include it in the
|
|||
[pipeline components](/usage/processing-pipelines#custom-components) before the
|
||||
`nlp` object is created.
|
||||
|
||||
<Infobox variant="warning" title="Important note on making manual edits">
|
||||
|
||||
While it's no problem to edit the package code or meta information, avoid making
|
||||
edits to the `config.cfg` **after** training, as this can easily lead to data
|
||||
incompatibility. For instance, changing an architecture or hyperparameter can
|
||||
mean that the trained weights are now incompatible. If you want to make
|
||||
adjustments, you can do so before training. Otherwise, you should always trust
|
||||
spaCy to export the current state of its `nlp` objects via
|
||||
[`nlp.config`](/api/language#config).
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Loading a custom model package {#loading}
|
||||
|
||||
To load a model from a data directory, you can use
|
||||
[`spacy.load()`](/api/top-level#spacy.load) with the local path. This will look
|
||||
for a meta.json in the directory and use the `lang` and `pipeline` settings to
|
||||
initialize a `Language` class with a processing pipeline and load in the model
|
||||
data.
|
||||
for a `config.cfg` in the directory and use the `lang` and `pipeline` settings
|
||||
to initialize a `Language` class with a processing pipeline and load in the
|
||||
model data.
|
||||
|
||||
```python
|
||||
nlp = spacy.load("/path/to/model")
|
||||
|
|
|
@@ -384,7 +384,40 @@ that reference this variable.
|
|||
|
||||
### Model architectures {#model-architectures}
|
||||
|
||||
<!-- TODO: refer to architectures API: /api/architectures -->
|
||||
> #### 💡 Model type annotations
|
||||
>
|
||||
> In the documentation and code base, you may come across type annotations and
|
||||
> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc],
|
||||
> List[Floats2d]]~~. This so-called generic type describes the layer and its
|
||||
> input and output type – in this case, it takes a list of `Doc` objects as the
|
||||
> input and list of 2-dimensional arrays of floats as the output. You can read
|
||||
> more about defining Thinc models [here](https://thinc.ai/docs/usage-models).
|
||||
> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for
|
||||
> how to enable linting in your editor to see live feedback if your inputs and
|
||||
> outputs don't match.
|
||||
|
||||
A **model architecture** is a function that wires up a Thinc
|
||||
[`Model`](https://thinc.ai/docs/api-model) instance, which you can then use in a
|
||||
component or as a layer of a larger network. You can use Thinc as a thin
|
||||
[wrapper around frameworks](https://thinc.ai/docs/usage-frameworks) such as
|
||||
PyTorch, TensorFlow or MXNet, or you can implement your logic in Thinc
|
||||
[directly](https://thinc.ai/docs/usage-models).
|
||||
|
||||
spaCy's built-in components will never construct their `Model` instances
|
||||
themselves, so you won't have to subclass the component to change its model
|
||||
architecture. You can just **update the config** so that it refers to a
|
||||
different registered function. Once the component has been created, its `Model`
|
||||
instance has already been assigned, so you cannot change its model architecture.
|
||||
The architecture is like a recipe for the network, and you can't change the
|
||||
recipe once the dish has already been prepared. You have to make a new one.
|
||||
spaCy includes a variety of built-in [architectures](/api/architectures) for
|
||||
different tasks. For example:
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
| Architecture | Description |
|
||||
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [HashEmbedCNN](/api/architectures#HashEmbedCNN) | Build spaCy’s “standard” embedding layer, which uses hash embedding with subword features and a CNN with layer-normalized maxout. ~~Model[List[Doc], List[Floats2d]]~~ |
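
If none of the built-in architectures fit, you can register your own function
and point the config at it. The sketch below registers a toy layer under a
made-up name, purely to show the mechanics – it's not a useful architecture:

```python
### Registering a custom architecture (sketch)
from typing import List
import spacy
from thinc.api import Model
from thinc.types import Floats2d
from spacy.tokens import Doc

@spacy.registry.architectures("my_org.ZeroTok2Vec.v1")  # hypothetical name
def build_zero_tok2vec(width: int) -> Model[List[Doc], List[Floats2d]]:
    def forward(model: Model, docs: List[Doc], is_train: bool):
        # One all-zero row per token, just to satisfy the output type
        outputs = [model.ops.alloc2f(len(doc), width) for doc in docs]
        def backprop(d_outputs):
            return []
        return outputs, backprop
    return Model("zero_tok2vec", forward, attrs={"width": width})
```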
|
||||
|
||||
### Metrics, training output and weighted scores {#metrics}
|
||||
|
||||
|
@@ -433,14 +466,12 @@ components are weighted equally.
|
|||
| Name | Description |
|
||||
| -------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
|
||||
| **Loss** | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`. |
|
||||
| **Precision** (P) | The percentage of generated predictions that are correct. Should increase. |
|
||||
| **Recall** (R) | The percentage of gold-standard annotations that are in fact predicted. Should increase. |
|
||||
| **F-Score** (F) | The weighted average of precision and recall. Should increase. |
|
||||
| **Precision** (P) | Percentage of predicted annotations that were correct. Should increase. |
|
||||
| **Recall** (R) | Percentage of reference annotations recovered. Should increase. |
|
||||
| **F-Score** (F) | Harmonic mean of precision and recall. Should increase. |
|
||||
| **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase. |
|
||||
| **Words per second** (WPS) | Prediction speed in words per second. Should stay stable. |
|
||||
|
||||
<!-- TODO: is this still relevant? -->
|
||||
|
||||
Note that if the development data has raw text, some of the gold-standard
|
||||
entities might not align to the predicted tokenization. These tokenization
|
||||
errors are **excluded from the NER evaluation**. If your tokenization makes it
|
||||
|
|
|
@@ -1,305 +0,0 @@
|
|||
---
|
||||
title: Transformers
|
||||
teaser: Using transformer models like BERT in spaCy
|
||||
menu:
|
||||
- ['Installation', 'install']
|
||||
- ['Runtime Usage', 'runtime']
|
||||
- ['Training Usage', 'training']
|
||||
next: /usage/training
|
||||
---
|
||||
|
||||
## Installation {#install hidden="true"}
|
||||
|
||||
Transformers are a family of neural network architectures that compute **dense,
|
||||
context-sensitive representations** for the tokens in your documents. Downstream
|
||||
models in your pipeline can then use these representations as input features to
|
||||
**improve their predictions**. You can connect multiple components to a single
|
||||
transformer model, with any or all of those components giving feedback to the
|
||||
transformer to fine-tune it to your tasks. spaCy's transformer support
|
||||
interoperates with [PyTorch](https://pytorch.org) and the
|
||||
[HuggingFace `transformers`](https://huggingface.co/transformers/) library,
|
||||
giving you access to thousands of pretrained models for your pipelines. There
|
||||
are many [great guides](http://jalammar.github.io/illustrated-transformer/) to
|
||||
transformer models, but for practical purposes, you can simply think of them as
|
||||
a drop-in replacement that lets you achieve **higher accuracy** in exchange for
|
||||
**higher training and runtime costs**.
|
||||
|
||||
### System requirements
|
||||
|
||||
We recommend an NVIDIA GPU with at least 10GB of memory in order to work with
|
||||
transformer models. The exact requirements will depend on the transformer
|
||||
model you choose and whether you're training the pipeline or simply running it.
|
||||
Training a transformer-based model without a GPU will be too slow for most
|
||||
practical purposes. You'll also need to make sure your GPU drivers are
|
||||
up-to-date and v9+ of the CUDA runtime is installed.
|
||||
|
||||
Once you have CUDA installed, you'll need to install two pip packages,
|
||||
[`cupy`](https://docs.cupy.dev/en/stable/install.html) and
|
||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy`
|
||||
is just like `numpy`, but for GPU. The best way to install it is to choose a
|
||||
wheel that matches the version of CUDA you're using. You may also need to set
|
||||
the `CUDA_PATH` environment variable if your CUDA runtime is installed in a
|
||||
non-standard location. Putting it all together, if you had installed CUDA 10.2
|
||||
in `/opt/nvidia/cuda`, you would run:
|
||||
|
||||
```bash
|
||||
### Installation with CUDA
|
||||
export CUDA_PATH="/opt/nvidia/cuda"
|
||||
pip install cupy-cuda102
|
||||
pip install spacy-transformers
|
||||
```
|
||||
|
||||
Provisioning a new machine will require about 5GB of data to be downloaded in
|
||||
total: 3GB for the CUDA runtime, 800MB for PyTorch, 400MB for CuPy, 500MB for
|
||||
the transformer weights, and about 200MB for spaCy and its various requirements.
|
||||
|
||||
## Runtime usage {#runtime}
|
||||
|
||||
Transformer models can be used as **drop-in replacements** for other types of
|
||||
neural networks, so your spaCy pipeline can include them in a way that's
|
||||
completely invisible to the user. Users will download, load and use the model in
|
||||
the standard way, like any other spaCy pipeline. Instead of using the
|
||||
transformers as subnetworks directly, you can also use them via the
|
||||
[`Transformer`](/api/transformer) pipeline component.
|
||||
|
||||
![The processing pipeline with the transformer component](../images/pipeline_transformer.svg)
|
||||
|
||||
The `Transformer` component sets the
|
||||
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
|
||||
which lets you access the transformer's outputs at runtime.
|
||||
|
||||
```bash
|
||||
$ python -m spacy download en_core_trf_lg
|
||||
```
|
||||
|
||||
```python
|
||||
### Example
|
||||
import spacy
|
||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||
|
||||
# Use the GPU, with memory allocations directed via PyTorch.
|
||||
# This prevents out-of-memory errors that would otherwise occur from competing
|
||||
# memory pools.
|
||||
use_pytorch_for_gpu_memory()
|
||||
require_gpu(0)
|
||||
|
||||
nlp = spacy.load("en_core_trf_lg")
|
||||
for doc in nlp.pipe(["some text", "some other text"]):
|
||||
tokvecs = doc._.trf_data.tensors[-1]
|
||||
```
|
||||
|
||||
You can also customize how the [`Transformer`](/api/transformer) component sets
|
||||
annotations onto the [`Doc`](/api/doc) by providing a custom `annotation_setter`.
|
||||
This callback will be called with the raw input and output data for the whole
|
||||
batch, along with the batch of `Doc` objects, allowing you to implement whatever
|
||||
you need. The annotation setter is called with a batch of [`Doc`](/api/doc)
|
||||
objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
|
||||
containing the transformer data for the batch.
|
||||
|
||||
```python
|
||||
import spacy
from spacy.tokens import Doc

def custom_annotation_setter(docs, trf_data):
|
||||
    # Sketch: store each doc's transformer output on a custom extension attribute
    for doc, data in zip(docs, trf_data.doc_data):
        doc._.custom_attr = data
|
||||
|
||||
|
||||
Doc.set_extension("custom_attr", default=None)
nlp = spacy.load("en_core_trf_lg")
|
||||
nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter
|
||||
doc = nlp("This is a text")
|
||||
print(doc._.custom_attr)
|
||||
```
|
||||
|
||||
## Training usage {#training}
|
||||
|
||||
The recommended workflow for training is to use spaCy's
|
||||
[config system](/usage/training#config), usually via the
|
||||
[`spacy train`](/api/cli#train) command. The training config defines all
|
||||
component settings and hyperparameters in one place and lets you describe a tree
|
||||
of objects by referring to creation functions, including functions you register
|
||||
yourself. For details on how to get started with training your own model, check
|
||||
out the [training quickstart](/usage/training#quickstart).
|
||||
|
||||
<Project id="en_core_bert">
|
||||
|
||||
The easiest way to get started is to clone a transformers-based project
|
||||
template. Swap in your data, edit the settings and hyperparameters and train,
|
||||
evaluate, package and visualize your model.
|
||||
|
||||
</Project>
|
||||
|
||||
The `[components]` section in the [`config.cfg`](/api/data-formats#config)
|
||||
describes the pipeline components and the settings used to construct them,
|
||||
including their model implementation. Here's a config snippet for the
|
||||
[`Transformer`](/api/transformer) component, along with matching Python code. In
|
||||
this case, the `[components.transformer]` block describes the `transformer`
|
||||
component:
|
||||
|
||||
> #### Python equivalent
|
||||
>
|
||||
> ```python
|
||||
> from spacy_transformers import Transformer, TransformerModel
|
||||
> from spacy_transformers.annotation_setters import null_annotation_setter
|
||||
> from spacy_transformers.span_getters import get_doc_spans
|
||||
>
|
||||
> trf = Transformer(
|
||||
> nlp.vocab,
|
||||
> TransformerModel(
|
||||
> "bert-base-cased",
|
||||
> get_spans=get_doc_spans,
|
||||
> tokenizer_config={"use_fast": True},
|
||||
> ),
|
||||
> annotation_setter=null_annotation_setter,
|
||||
> max_batch_items=4096,
|
||||
> )
|
||||
> ```
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt)
|
||||
[components.transformer]
|
||||
factory = "transformer"
|
||||
max_batch_items = 4096
|
||||
|
||||
[components.transformer.model]
|
||||
@architectures = "spacy-transformers.TransformerModel.v1"
|
||||
name = "bert-base-cased"
|
||||
tokenizer_config = {"use_fast": true}
|
||||
|
||||
[components.transformer.model.get_spans]
|
||||
@span_getters = "doc_spans.v1"
|
||||
|
||||
[components.transformer.annotation_setter]
|
||||
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
|
||||
|
||||
```
|
||||
|
||||
The `[components.transformer.model]` block describes the `model` argument passed
|
||||
to the transformer component. It's a Thinc
|
||||
[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the
|
||||
component. Here, it references the function
|
||||
[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel)
|
||||
registered in the [`architectures` registry](/api/top-level#registry). If a key
|
||||
in a block starts with `@`, it's **resolved to a function** and all other
|
||||
settings are passed to the function as arguments. In this case, `name`,
|
||||
`tokenizer_config` and `get_spans`.
|
||||
|
||||
`get_spans` is a function that takes a batch of `Doc` objects and returns lists
|
||||
of potentially overlapping `Span` objects for the transformer to process. Several
|
||||
[built-in functions](/api/transformer#span-getters) are available – for example,
|
||||
to process the whole document or individual sentences. When the config is
|
||||
resolved, the function is created and passed into the model as an argument.
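
For instance, the `get_doc_spans` getter used above returns one span per `Doc`
covering the full text – a small sketch:

```python
import spacy
from spacy_transformers.span_getters import get_doc_spans

nlp = spacy.blank("en")
docs = [nlp("A short text."), nlp("Another, slightly longer text.")]
print(get_doc_spans(docs))  # one whole-document span per doc
```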
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
Remember that the `config.cfg` used for training should contain **no missing
|
||||
values** and requires all settings to be defined. You don't want any hidden
|
||||
defaults creeping in and changing your results! spaCy will tell you if settings
|
||||
are missing, and you can run
|
||||
[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in
|
||||
all defaults.
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Customizing the settings {#training-custom-settings}
|
||||
|
||||
To change any of the settings, you can edit the `config.cfg` and re-run the
|
||||
training. To change any of the functions, like the span getter, you can replace
|
||||
the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to
|
||||
process sentences. You can also register your own functions using the
|
||||
`span_getters` registry:
|
||||
|
||||
> #### config.cfg
|
||||
>
|
||||
> ```ini
|
||||
> [components.transformer.model.get_spans]
|
||||
> @span_getters = "custom_sent_spans"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### code.py
|
||||
import spacy_transformers
|
||||
|
||||
@spacy_transformers.registry.span_getters("custom_sent_spans")
|
||||
def configure_custom_sent_spans():
|
||||
    # A simple getter: one list of sentence spans per doc (requires sentence boundaries)
|
||||
def get_sent_spans(docs):
|
||||
return [list(doc.sents) for doc in docs]
|
||||
|
||||
return get_sent_spans
|
||||
```
|
||||
|
||||
To resolve the config during training, spaCy needs to know about your custom
|
||||
function. You can make it available via the `--code` argument that can point to
|
||||
a Python file. For more details on training with custom code, see the
|
||||
[training documentation](/usage/training#custom-code).
|
||||
|
||||
```bash
|
||||
$ python -m spacy train ./config.cfg --code ./code.py
|
||||
```
|
||||
|
||||
### Customizing the model implementations {#training-custom-model}
|
||||
|
||||
The [`Transformer`](/api/transformer) component expects a Thinc
|
||||
[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model`
|
||||
argument. You're not limited to the implementation provided by
|
||||
`spacy-transformers` – the only requirement is that your registered function
|
||||
must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that
|
||||
is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a
|
||||
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the
|
||||
transformer data.
|
||||
|
||||
> #### Model type annotations
|
||||
>
|
||||
> In the documentation and code base, you may come across type annotations and
|
||||
> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc],
|
||||
> List[Floats2d]]~~. This so-called generic type describes the layer and its
|
||||
> input and output type – in this case, it takes a list of `Doc` objects as the
|
||||
> input and list of 2-dimensional arrays of floats as the output. You can read
|
||||
> more about defining Thinc models [here](https://thinc.ai/docs/usage-models).
|
||||
> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for
|
||||
> how to enable linting in your editor to see live feedback if your inputs and
|
||||
> outputs don't match.
|
||||
|
||||
The same idea applies to task models that power the **downstream components**.
|
||||
Most of spaCy's built-in model creation functions support a `tok2vec` argument,
|
||||
which should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This
|
||||
is where we'll plug in our transformer model, using the
|
||||
[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily
|
||||
delegates to the `Transformer` pipeline component.
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt) {highlight="12"}
|
||||
[components.ner]
|
||||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.ner.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
```
|
||||
|
||||
The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a
|
||||
[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument
|
||||
`pooling`, which needs to be of type `Model[Ragged, Floats2d]`. This layer
|
||||
determines how the vector for each spaCy token will be computed from the zero or
|
||||
more source rows the token is aligned against. Here we use the
|
||||
[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which
|
||||
averages the wordpiece rows. We could instead use
|
||||
[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom
|
||||
function you write yourself.
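
Both reduction layers come straight from Thinc, so you can also construct them
in Python – a small sketch:

```python
from thinc.api import reduce_mean, reduce_max

pooling = reduce_mean()    # average the aligned wordpiece rows per token
# pooling = reduce_max()   # or take the element-wise maximum instead
```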
|
||||
|
||||
You can have multiple components all listening to the same transformer model,
|
||||
and all passing gradients back to it. By default, all of the gradients will be
|
||||
**equally weighted**. You can control this with the `grad_factor` setting, which
|
||||
lets you reweight the gradients from the different listeners. For instance,
|
||||
setting `grad_factor = 0` would disable gradients from one of the listeners,
|
||||
while `grad_factor = 2.0` would multiply them by 2. This is similar to having a
|
||||
custom learning rate for each component. Instead of a constant, you can also
|
||||
provide a schedule, allowing you to freeze the shared parameters at the start of
|
||||
training.
|
|
@@ -152,6 +152,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
|
|||
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) that can be saved to disk and used for training. |
|
||||
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
|
||||
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
|
||||
| [`util.load_meta`](/api/top-level#util.load_meta) [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
|
||||
| [`init config`](/api/cli#init-config) [`init fill-config`](/api/cli#init-fill-config) [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
|
||||
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
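
For example, the updated helpers can be used like this (the paths are
placeholders for your own files):

```python
from spacy.util import load_config, load_meta

config = load_config("./config.cfg", interpolate=True)
meta = load_meta("./meta.json")
print(list(config.keys()))   # e.g. ['nlp', 'components', 'training', ...]
print(meta.get("name"), meta.get("version"))
```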
|
||||
|
||||
|
@@ -175,6 +176,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
|
|||
There can be many [different models](/models) and not just one "English
|
||||
model", so you should always use the full model name like
|
||||
[`en_core_web_sm`](/models/en) explicitly.
|
||||
- A model's [`meta.json`](/api/data-formats#meta) is now only used to provide
|
||||
meta information like the model name, author, license and labels. It's **not**
|
||||
used to construct the processing pipeline anymore. This is all defined in the
|
||||
[`config.cfg`](/api/data-formats#config), which also includes all settings
|
||||
used to train the model.
|
||||
- The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
|
||||
only take a `config.cfg` file containing the full
|
||||
[training config](/usage/training#config).
|
||||
|
|
|
@@ -32,6 +32,7 @@
|
|||
"Floats2d": "https://thinc.ai/docs/api-types#types",
|
||||
"Floats3d": "https://thinc.ai/docs/api-types#types",
|
||||
"FloatsXd": "https://thinc.ai/docs/api-types#types",
|
||||
"Ops": "https://thinc.ai/docs/api-backends#ops",
|
||||
"cymem.Pool": "https://github.com/explosion/cymem",
|
||||
"preshed.BloomFilter": "https://github.com/explosion/preshed",
|
||||
"transformers.BatchEncoding": "https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding",
|
||||
|
|
|
@@ -53,7 +53,15 @@ const icons = {
|
|||
package: PackageIcon,
|
||||
}
|
||||
|
||||
export default function Icon({ name, width = 20, height, inline = false, variant, className }) {
|
||||
export default function Icon({
|
||||
name,
|
||||
width = 20,
|
||||
height,
|
||||
inline = false,
|
||||
variant,
|
||||
className,
|
||||
...props
|
||||
}) {
|
||||
const IconComponent = icons[name]
|
||||
const iconClassNames = classNames(classes.root, className, {
|
||||
[classes.inline]: inline,
|
||||
|
@@ -67,6 +75,7 @@ export default function Icon({ name, width = 20, height, inline = false, variant
|
|||
aria-hidden="true"
|
||||
width={width}
|
||||
height={height || width}
|
||||
{...props}
|
||||
/>
|
||||
)
|
||||
}
|
||||
|
|
|
@@ -9,19 +9,25 @@ function isNum(children) {
|
|||
return isString(children) && /^\d+[.,]?[\dx]+?(|x|ms|mb|gb|k|m)?$/i.test(children)
|
||||
}
|
||||
|
||||
function getCellContent(children) {
|
||||
function getCellContent(cellChildren) {
|
||||
const icons = {
|
||||
'✅': { name: 'yes', variant: 'success' },
|
||||
'❌': { name: 'no', variant: 'error' },
|
||||
'✅': { name: 'yes', variant: 'success', 'aria-label': 'positive' },
|
||||
'❌': { name: 'no', variant: 'error', 'aria-label': 'negative' },
|
||||
}
|
||||
|
||||
if (isString(children) && icons[children.trim()]) {
|
||||
const iconProps = icons[children.trim()]
|
||||
return <Icon {...iconProps} />
|
||||
}
|
||||
// Work around prettier auto-escape
|
||||
if (isString(children) && children.startsWith('\\')) {
|
||||
return children.slice(1)
|
||||
let children = isString(cellChildren) ? [cellChildren] : cellChildren
|
||||
if (Array.isArray(children)) {
|
||||
return children.map((child, i) => {
|
||||
if (isString(child)) {
|
||||
const icon = icons[child.trim()]
|
||||
if (icon) {
|
||||
const props = { ...icon, inline: i < children.length, 'aria-hidden': undefined }
|
||||
return <Icon {...props} key={i} />
|
||||
}
|
||||
// Work around prettier auto-escape
|
||||
if (child.startsWith('\\')) return child.slice(1)
|
||||
}
|
||||
return child
|
||||
})
|
||||
}
|
||||
return children
|
||||
}
|
||||
|
|
|
@@ -38,7 +38,8 @@ const DATA = [
|
|||
{
|
||||
id: 'optimize',
|
||||
title: 'Optimize for',
|
||||
help: '...',
|
||||
help:
|
||||
'Optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger & slower model). Will impact the choice of architecture, pretrained weights and hyperparameters.',
|
||||
options: [
|
||||
{ id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
|
||||
{ id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
|
||||
|
@@ -84,10 +85,12 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
|
|||
query={query}
|
||||
render={({ site }) => {
|
||||
const langs = site.siteMetadata.languages
|
||||
DATA[0].dropdown = langs.map(({ name, code }) => ({
|
||||
id: code,
|
||||
title: name,
|
||||
}))
|
||||
DATA[0].dropdown = langs
|
||||
.map(({ name, code }) => ({
|
||||
id: code,
|
||||
title: name,
|
||||
}))
|
||||
.sort((a, b) => a.id.localeCompare(b.id))
|
||||
return (
|
||||
<Quickstart
|
||||
download={download}
|
||||
|
|