Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-11 17:10:36 +03:00)

Merge branch 'develop' into nightly.spacy.io

This commit is contained in:
commit b42c0d5161
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a43,<8.0.0a50",
+    "thinc>=8.0.0a44,<8.0.0a50",
     "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a43,<8.0.0a50
+thinc>=8.0.0a44,<8.0.0a50
 blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a43,<8.0.0a50
+    thinc>=8.0.0a44,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a43,<8.0.0a50
+    thinc>=8.0.0a44,<8.0.0a50
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a36"
+__version__ = "3.0.0a41"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -253,7 +253,7 @@ def _get_converter(msg, converter, input_path):
     if converter == "auto":
         converter = input_path.suffix[1:]
     if converter == "ner" or converter == "iob":
-        with input_path.open() as file_:
+        with input_path.open(encoding="utf8") as file_:
             input_data = file_.read()
         converter_autodetect = autodetect_ner_format(input_data)
         if converter_autodetect == "ner":
@@ -32,10 +32,10 @@ es:
   word_vectors: null
   transformer:
     efficiency:
-      name: mrm8488/RuPERTa-base
+      name: dccuchile/bert-base-spanish-wwm-cased
       size_factor: 3
     accuracy:
-      name: mrm8488/RuPERTa-base
+      name: dccuchile/bert-base-spanish-wwm-cased
       size_factor: 3
 sv:
   word_vectors: null
@@ -101,3 +101,21 @@ pl:
     accuracy:
       name: dkleczek/bert-base-polish-cased-v1
       size_factor: 3
+nl:
+  word_vectors: null
+  transformer:
+    efficiency:
+      name: pdelobelle/robbert-v2-dutch-base
+      size_factor: 3
+    accuracy:
+      name: pdelobelle/robbert-v2-dutch-base
+      size_factor: 3
+pt:
+  word_vectors: null
+  transformer:
+    efficiency:
+      name: neuralmind/bert-base-portuguese-cased
+      size_factor: 3
+    accuracy:
+      name: neuralmind/bert-base-portuguese-cased
+      size_factor: 3
@@ -456,6 +456,17 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")

+    # TODO: fix numbering after merging develop into master
+    E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
+            "is not set or None. If you've implemented a custom component, make "
+            "sure to store the component model as `self.model` in your "
+            "component's __init__ method.")
+    E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute "
+            "is not set or None. If you've implemented a custom component, make "
+            "sure to store the current `nlp` object's vocab as `self.vocab` in "
+            "your component's __init__ method.")
     E900 = ("Could not run the full pipeline for evaluation. If you specified "
             "frozen components, make sure they were already initialized and "
             "trained. Full pipeline: {pipeline}")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "
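Editor's note: E898/E899 spell out the contract for custom trainable components: `self.vocab` and `self.model` must be set in `__init__` before serialization. A minimal sketch of a component that satisfies the new checks, modelled on the test added later in this commit (the class name is illustrative, and a real component would also implement predict/set_annotations):

    from spacy.pipeline import TrainablePipe
    from spacy.vocab import Vocab
    from thinc.api import Linear

    class CustomPipe(TrainablePipe):
        def __init__(self, vocab, model, name="custom"):
            # Setting both attributes avoids E899 (missing vocab) and
            # E898 (missing model) when to_bytes()/to_disk() validate them.
            self.vocab = vocab
            self.model = model
            self.name = name
            self.cfg = {}

    pipe = CustomPipe(Vocab(), Linear())
    data = pipe.to_bytes()
    new_pipe = CustomPipe(Vocab(), Linear()).from_bytes(data)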
@@ -30,7 +30,6 @@ cdef class KnowledgeBase:
     cdef Pool mem
     cpdef readonly Vocab vocab
     cdef int64_t entity_vector_length
-    cdef public set _added_strings

     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _KBEntryC struct in the _entries vector).
spacy/kb.pyx (15 changes)
@@ -92,7 +92,6 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.vocab = vocab
         self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
-        self._added_strings = set()

     @property
     def entity_vector_length(self):
@@ -114,16 +113,12 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]

-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
-
     def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.
         """
-        cdef hash_t entity_hash = self.add_string(entity)
+        cdef hash_t entity_hash = self.vocab.strings.add(entity)

         # Return if this entity was added before
         if entity_hash in self._entry_index:
@@ -157,7 +152,7 @@ cdef class KnowledgeBase:
         cdef hash_t entity_hash
         while i < len(entity_list):
             # only process this entity if its unique ID hadn't been added before
-            entity_hash = self.add_string(entity_list[i])
+            entity_hash = self.vocab.strings.add(entity_list[i])
             if entity_hash in self._entry_index:
                 warnings.warn(Warnings.W018.format(entity=entity_list[i]))

@@ -203,7 +198,7 @@ cdef class KnowledgeBase:
         if prob_sum > 1.00001:
             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))

-        cdef hash_t alias_hash = self.add_string(alias)
+        cdef hash_t alias_hash = self.vocab.strings.add(alias)

         # Check whether this alias was added before
         if alias_hash in self._alias_index:
@@ -332,7 +327,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E928.format(loc=path))
         serialize = {}
         serialize["contents"] = lambda p: self.write_contents(p)
-        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
         util.to_disk(path, serialize, exclude)

     def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
@@ -343,7 +338,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E928.format(loc=path))
         deserialize = {}
         deserialize["contents"] = lambda p: self.read_contents(p)
-        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
+        deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
         util.from_disk(path, deserialize, exclude)

     def write_contents(self, file_path):
@@ -62,6 +62,7 @@ _ordinal_words = [

 _ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")


 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
@@ -75,11 +76,11 @@ def like_num(text):

     text_lower = text.lower()

-    #Check cardinal number
+    # Check cardinal number
     if text_lower in _num_words:
         return True

-    #Check ordinal number
+    # Check ordinal number
     if text_lower in _ordinal_words:
         return True
     if text_lower.endswith(_ordinal_endings):
@@ -51,9 +51,8 @@ def noun_chunks(doclike):
         elif word.dep == conj:
             cc_token = word.left_edge
             prev_end = cc_token.i
-            yield cc_token.right_edge.i + 1, extend_right(word), np_label  # Shave off cc tokens from the NP
+            # Shave off cc tokens from the NP
+            yield cc_token.right_edge.i + 1, extend_right(word), np_label


 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
@@ -1,5 +1,5 @@
 from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
-from typing import Tuple, Iterator
+from typing import Tuple
 from dataclasses import dataclass
 import random
 import itertools
@@ -1034,6 +1034,9 @@ class Language:
             )
         )
         disable = to_disable
+        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
+        # those pipes that were already disabled.
+        disable = [d for d in disable if d not in self._disabled]
         return DisabledPipes(self, disable)

     def make_doc(self, text: str) -> Doc:
@@ -1194,7 +1197,9 @@ class Language:
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
         if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(method="Language.initialize", obj=type(get_examples))
+            err = Errors.E930.format(
+                method="Language.initialize", obj=type(get_examples)
+            )
             raise TypeError(err)
         # Make sure the config is interpolated so we can resolve subsections
         config = self.config.interpolate()
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Optional, Callable, Iterable
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
@@ -25,7 +26,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:


 @registry.misc.register("spacy.KBFromFile.v1")
-def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
+def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
     def kb_from_file(vocab):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
         kb.from_disk(kb_path)
@@ -24,11 +24,11 @@ def build_simple_cnn_text_classifier(
     """
     with Model.define_operators({">>": chain}):
         if exclusive_classes:
-            output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO"))
+            output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
             model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
-            linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
+            linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
             model = (
                 tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
             )
@@ -110,7 +110,7 @@ def MultiHashEmbed(

     The features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
-    account some subword information, without construction a fully character-based
+    account some subword information, without constructing a fully character-based
     representation. If pretrained vectors are available, they can be included in
     the representation as well, with the vectors table will be kept static
     (i.e. it's not updated).
@@ -177,7 +177,7 @@ def CharacterEmbed(
     rows: int,
     nM: int,
     nC: int,
-    also_use_static_vectors: bool,
+    include_static_vectors: bool,
     feature: Union[int, str] = "LOWER",
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
@@ -204,13 +204,13 @@ def CharacterEmbed(
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
         are between 3 and 8, although it may depend on the length of words in the
        language.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
     feature = intify_attr(feature)
     if feature is None:
         raise ValueError(Errors.E911(feat=feature))
-    if also_use_static_vectors:
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
@@ -39,7 +39,6 @@ def forward(
     key_attr = model.attrs["key_attr"]
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
     V = cast(Floats2d, docs[0].vocab.vectors.data)
-    mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
     rows = model.ops.flatten(
         [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
     )
@@ -47,8 +46,11 @@ def forward(
         model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True),
         model.ops.asarray([len(doc) for doc in docs], dtype="i"),
     )
-    if mask is not None:
-        output.data *= mask
+    mask = None
+    if is_train:
+        mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
+        if mask is not None:
+            output.data *= mask

     def backprop(d_output: Ragged) -> List[Doc]:
         if mask is not None:
@@ -1,4 +1,4 @@
-from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator
+from typing import List, Dict, Union, Iterable, Any, Optional, Callable
 from typing import Tuple
 import srsly
 from pathlib import Path
@@ -57,7 +57,6 @@ class AttributeRuler(Pipe):
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
-        self._added_strings = set()

     def clear(self) -> None:
         """Reset all patterns."""
@@ -187,16 +186,12 @@ class AttributeRuler(Pipe):
         # We need to make a string here, because otherwise the ID we pass back
         # will be interpreted as the hash of a string, rather than an ordinal.
         key = str(len(self.attrs))
-        self.matcher.add(self.add_string(key), patterns)
+        self.matcher.add(self.vocab.strings.add(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)
         self.indices.append(index)

-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
-
     def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
         """Add patterns from a list of pattern dicts with the keys as the
         arguments to AttributeRuler.add.
@@ -256,8 +251,8 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
         """
         serialize = {}
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
-        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
         return util.to_bytes(serialize, exclude)

     def from_bytes(
@@ -276,7 +271,7 @@ class AttributeRuler(Pipe):
             self.add_patterns(srsly.msgpack_loads(b))

         deserialize = {
-            "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
+            "vocab": lambda b: self.vocab.from_bytes(b),
             "patterns": load_patterns,
         }
         util.from_bytes(bytes_data, deserialize, exclude)
@@ -293,7 +288,7 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
         """
         serialize = {
-            "strings.json": lambda p: srsly.write_json(p, self._added_strings),
+            "vocab": lambda p: self.vocab.to_disk(p),
             "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
         }
         util.to_disk(path, serialize, exclude)
@@ -314,7 +309,7 @@ class AttributeRuler(Pipe):
             self.add_patterns(srsly.read_msgpack(p))

         deserialize = {
-            "strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)],
+            "vocab": lambda p: self.vocab.from_disk(p),
             "patterns": load_patterns,
         }
         util.from_disk(path, deserialize, exclude)
@@ -453,6 +453,7 @@ class EntityLinker(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/entitylinker#to_disk
         """
         serialize = {}
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
         serialize["kb"] = lambda p: self.kb.to_disk(p)
         serialize["model"] = lambda p: self.model.to_disk(p)
@@ -481,8 +482,6 @@ class EntityLinker(TrainablePipe):
         deserialize["kb"] = lambda p: self.kb.from_disk(p)
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
-        for s in self.kb._added_strings:
-            self.vocab.strings.add(s)
         return self

     def rehearse(self, examples, *, sgd=None, losses=None, **config):
@@ -281,6 +281,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)

@@ -296,6 +297,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
+        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
         self._validate_tables()
@@ -310,6 +312,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)

@@ -325,6 +328,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
         self._validate_tables()
@@ -32,7 +32,7 @@ width = 128
 rows = 7000
 nM = 64
 nC = 8
-also_use_static_vectors = false
+include_static_vectors = false

 [model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"
@@ -95,7 +95,6 @@ class Morphologizer(Tagger):
         # add mappings for empty morph
         self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
         self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
-        self._added_strings = set()

     @property
     def labels(self):
@@ -129,7 +128,6 @@ class Morphologizer(Tagger):
             label_dict.pop(self.POS_FEAT)
         # normalize morph string and add to morphology table
         norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
-        self.add_string(norm_morph)
         # add label mappings
         if norm_label not in self.cfg["labels_morph"]:
             self.cfg["labels_morph"][norm_label] = norm_morph
@@ -161,7 +159,6 @@ class Morphologizer(Tagger):
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
                 norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
-                self.add_string(norm_label)
                 # add label->morph and label->POS mappings
                 if norm_label not in self.cfg["labels_morph"]:
                     self.cfg["labels_morph"][norm_label] = morph
@@ -179,7 +176,6 @@ class Morphologizer(Tagger):
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
                 norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
-                self.add_string(norm_label)
                 gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
             doc_sample.append(example.x)
             label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
@@ -238,7 +234,6 @@ class Morphologizer(Tagger):
                 if pos:
                     label_dict[self.POS_FEAT] = pos
                 label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
-                self.add_string(label)
                 eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
@@ -61,7 +61,6 @@ class SentenceRecognizer(Tagger):
         self.name = name
         self._rehearsal_model = None
         self.cfg = {}
-        self._added_strings = set()

     @property
     def labels(self):
@@ -78,7 +78,6 @@ class Tagger(TrainablePipe):
         self._rehearsal_model = None
         cfg = {"labels": labels or []}
         self.cfg = dict(sorted(cfg.items()))
-        self._added_strings = set()

     @property
     def labels(self):
@@ -313,7 +312,7 @@ class Tagger(TrainablePipe):
             return 0
         self._allow_extra_label()
         self.cfg["labels"].append(label)
-        self.add_string(label)
+        self.vocab.strings.add(label)
         return 1

     def score(self, examples, **kwargs):
@@ -110,7 +110,6 @@ class TextCategorizer(TrainablePipe):
         self._rehearsal_model = None
         cfg = {"labels": [], "threshold": threshold, "positive_label": None}
         self.cfg = dict(cfg)
-        self._added_strings = set()

     @property
     def labels(self) -> Tuple[str]:
@@ -301,7 +300,7 @@ class TextCategorizer(TrainablePipe):
             return 0
         self._allow_extra_label()
         self.cfg["labels"].append(label)
-        self.add_string(label)
+        self.vocab.strings.add(label)
         return 1

     def initialize(
@@ -64,7 +64,6 @@ class Tok2Vec(TrainablePipe):
         self.name = name
         self.listeners = []
         self.cfg = {}
-        self._added_strings = set()

     def add_listener(self, listener: "Tok2VecListener") -> None:
         """Add a listener for a downstream component. Usually internals."""
@@ -5,4 +5,3 @@ cdef class TrainablePipe(Pipe):
     cdef public Vocab vocab
     cdef public object model
     cdef public object cfg
-    cdef public set _added_strings
@@ -13,6 +13,7 @@ from ..vocab import Vocab
 from ..language import Language
 from ..training import Example


 cdef class TrainablePipe(Pipe):
     """This class is a base class and not instantiated directly. Trainable
     pipeline components like the EntityRecognizer or TextCategorizer inherit
@@ -35,7 +36,6 @@ cdef class TrainablePipe(Pipe):
         self.model = model
         self.name = name
         self.cfg = dict(cfg)
-        self._added_strings = set()

     def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
@@ -198,10 +198,6 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))

-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
-
     @property
     def is_trainable(self) -> bool:
         return True
@@ -244,6 +240,16 @@ cdef class TrainablePipe(Pipe):
         """
         self.model.finish_update(sgd)

+    def _validate_serialization_attrs(self):
+        """Check that the pipe implements the required attributes. If a subclass
+        implements a custom __init__ method but doesn't set these attributes,
+        they currently default to None, so we need to perform additonal checks.
+        """
+        if not hasattr(self, "vocab") or self.vocab is None:
+            raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
+        if not hasattr(self, "model") or self.model is None:
+            raise ValueError(Errors.E898.format(name=util.get_object_name(self)))
+
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the pipe to a bytestring.

@@ -252,11 +258,12 @@ cdef class TrainablePipe(Pipe):

         DOCS: https://nightly.spacy.io/api/pipe#to_bytes
         """
+        self._validate_serialization_attrs()
         serialize = {}
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["model"] = self.model.to_bytes
-        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, *, exclude=tuple()):
@@ -267,6 +274,7 @@ cdef class TrainablePipe(Pipe):

         DOCS: https://nightly.spacy.io/api/pipe#from_bytes
         """
+        self._validate_serialization_attrs()

         def load_model(b):
             try:
@@ -275,9 +283,9 @@ cdef class TrainablePipe(Pipe):
                 raise ValueError(Errors.E149) from None

         deserialize = {}
-        deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)]
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["model"] = load_model
         util.from_bytes(bytes_data, deserialize, exclude)
         return self
@@ -290,10 +298,11 @@ cdef class TrainablePipe(Pipe):

         DOCS: https://nightly.spacy.io/api/pipe#to_disk
         """
+        self._validate_serialization_attrs()
         serialize = {}
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["model"] = lambda p: self.model.to_disk(p)
         util.to_disk(path, serialize, exclude)

@@ -306,6 +315,7 @@ cdef class TrainablePipe(Pipe):

         DOCS: https://nightly.spacy.io/api/pipe#from_disk
         """
+        self._validate_serialization_attrs()

         def load_model(p):
             try:
@@ -314,9 +324,9 @@ cdef class TrainablePipe(Pipe):
                 raise ValueError(Errors.E149) from None

         deserialize = {}
-        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
+        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
         return self
@@ -76,7 +76,6 @@ cdef class Parser(TrainablePipe):
             self.add_multitask_objective(multitask)

         self._rehearsal_model = None
-        self._added_strings = set()

     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -120,7 +119,7 @@ cdef class Parser(TrainablePipe):
                 resized = True
         if resized:
             self._resize()
-            self.add_string(label)
+            self.vocab.strings.add(label)
             return 1
         return 0

@@ -456,24 +455,24 @@ cdef class Parser(TrainablePipe):

     def to_disk(self, path, exclude=tuple()):
         serializers = {
-            'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
-            'strings.json': lambda p: srsly.write_json(p, self._added_strings),
-            'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
-            'cfg': lambda p: srsly.write_json(p, self.cfg)
+            "model": lambda p: (self.model.to_disk(p) if self.model is not True else True),
+            "vocab": lambda p: self.vocab.to_disk(p),
+            "moves": lambda p: self.moves.to_disk(p, exclude=["strings"]),
+            "cfg": lambda p: srsly.write_json(p, self.cfg)
         }
         util.to_disk(path, serializers, exclude)

     def from_disk(self, path, exclude=tuple()):
         deserializers = {
-            'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)],
-            'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
-            'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
-            'model': lambda p: None,
+            "vocab": lambda p: self.vocab.from_disk(p),
+            "moves": lambda p: self.moves.from_disk(p, exclude=["strings"]),
+            "cfg": lambda p: self.cfg.update(srsly.read_json(p)),
+            "model": lambda p: None,
         }
         util.from_disk(path, deserializers, exclude)
-        if 'model' not in exclude:
+        if "model" not in exclude:
             path = util.ensure_path(path)
-            with (path / 'model').open('rb') as file_:
+            with (path / "model").open("rb") as file_:
                 bytes_data = file_.read()
             try:
                 self._resize()
@@ -485,7 +484,7 @@ cdef class Parser(TrainablePipe):
     def to_bytes(self, exclude=tuple()):
         serializers = {
             "model": lambda: (self.model.to_bytes()),
-            "strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)),
+            "vocab": lambda: self.vocab.to_bytes(),
             "moves": lambda: self.moves.to_bytes(exclude=["strings"]),
             "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
         }
@@ -493,7 +492,7 @@ cdef class Parser(TrainablePipe):

     def from_bytes(self, bytes_data, exclude=tuple()):
         deserializers = {
-            "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
+            "vocab": lambda b: self.vocab.from_bytes(b),
             "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
             "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
             "model": lambda b: None,
@@ -282,7 +282,7 @@ class ModelMetaSchema(BaseModel):
     sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
     vectors: Dict[str, Any] = Field({}, title="Included word vectors")
     labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
-    performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
+    performance: Dict[str, Any] = Field({}, title="Accuracy and speed numbers")
     spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
     # fmt: on
@@ -239,10 +239,12 @@ def th_tokenizer():
 def tr_tokenizer():
     return get_lang_class("tr")().tokenizer


 @pytest.fixture(scope="session")
 def tr_vocab():
     return get_lang_class("tr").Defaults.create_vocab()


 @pytest.fixture(scope="session")
 def tt_tokenizer():
     return get_lang_class("tt")().tokenizer
@@ -608,14 +608,11 @@ def test_doc_init_iob():
     doc = Doc(Vocab(), words=words, ents=ents)


-@pytest.mark.xfail
-def test_doc_set_ents_spans(en_tokenizer):
+def test_doc_set_ents_invalid_spans(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")
     spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
-    # If this line is uncommented, it works:
-    # print(spans)
-    doc.ents = spans
-    assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]
+    with pytest.raises(IndexError):
+        doc.ents = spans
@@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
         attrs = {"lemma": "none", "ent_type": "none"}
         retokenizer.merge(doc[0:2], attrs=attrs)
         retokenizer.merge(doc[-2:], attrs=attrs)
+    sent1, sent2 = list(doc.sents)
     assert len(sent1) == init_len - 1
     assert len(sent2) == init_len2 - 1

@@ -225,7 +225,7 @@ def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
     assert chunks[0].text_with_ws == "en sevdiğim ses sanatçısı "


-def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
+def test_tr_noun_chunks_acl_nmod2(tr_tokenizer):
     text = "bildiğim bir turizm şirketi"
     heads = [3, 3, 3, 3]
     deps = ["acl", "det", "nmod", "ROOT"]
@@ -326,7 +326,7 @@ def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer):
 def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
     text = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo"
     heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9]
-    deps = ["obl", "nmod" , "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
+    deps = ["obl", "nmod", "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
     pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"]
     tokens = tr_tokenizer(text)
     doc = Doc(
@@ -334,7 +334,10 @@ def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
     )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 1
-    assert chunks[0].text_with_ws == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+    assert (
+        chunks[0].text_with_ws
+        == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+    )


 def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
@@ -350,7 +353,8 @@ def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
     assert len(chunks) == 1
     assert chunks[0].text_with_ws == "kız ve erkek çocuklar "

-def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
+
+def test_tr_noun_chunks_two_nouns_in_nmod2(tr_tokenizer):
     text = "tatlı ve gürbüz çocuklar"
     heads = [3, 2, 0, 3]
     deps = ["amod", "cc", "conj", "ROOT"]
@@ -378,6 +382,7 @@ def test_tr_noun_chunks_conj_simple(tr_tokenizer):
     assert chunks[0].text_with_ws == "ben "
     assert chunks[1].text_with_ws == "Sen "

+
 def test_tr_noun_chunks_conj_three(tr_tokenizer):
     text = "sen, ben ve ondan"
     heads = [0, 2, 0, 4, 0]
@@ -394,7 +399,7 @@ def test_tr_noun_chunks_conj_three(tr_tokenizer):
     assert chunks[2].text_with_ws == "sen "


-def test_tr_noun_chunks_conj_three(tr_tokenizer):
+def test_tr_noun_chunks_conj_three2(tr_tokenizer):
     text = "ben ya da sen ya da onlar"
     heads = [0, 3, 1, 0, 6, 4, 3]
     deps = ["ROOT", "cc", "fixed", "conj", "cc", "fixed", "conj"]
@@ -499,7 +504,7 @@ def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
     assert chunks[0].text_with_ws == "Gazi Mustafa Kemal "


-def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
+def test_tr_noun_chunks_flat_names_and_title2(tr_tokenizer):
     text = "Ahmet Vefik Paşa"
     heads = [2, 0, 2]
     deps = ["nmod", "flat", "ROOT"]
@@ -15,8 +15,8 @@ from spacy.lang.tr.lex_attrs import like_num
         "üçüncü",
         "beşinci",
         "100üncü",
-        "8inci"
-    ]
+        "8inci",
+    ],
 )
 def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
     assert like_num(word)
@@ -26,4 +26,3 @@ def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
 def test_tr_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())
-
@@ -5,6 +5,7 @@ from spacy.kb import KnowledgeBase, get_candidates, Candidate
 from spacy.vocab import Vocab

 from spacy import util, registry
+from spacy.ml import load_kb
 from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
@@ -121,9 +122,7 @@ def test_kb_default(nlp):

 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
-    entity_linker = nlp.add_pipe(
-        "entity_linker", config={"entity_vector_length": 35}
-    )
+    entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 35})
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
@@ -213,16 +212,11 @@ def test_el_pipe_configuration(nlp):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
         kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
         kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
-        kb.add_alias(
-            alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]
-        )
+        kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
         return kb

     # run an EL pipe without a trained context encoder, to check the candidate generation step only
-    entity_linker = nlp.add_pipe(
-        "entity_linker",
-        config={"incl_context": False},
-    )
+    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False})
     entity_linker.set_kb(create_kb)
     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
@@ -453,14 +447,10 @@ def test_overfitting_IO():
         return mykb

     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe(
-        "entity_linker",
-        last=True,
-    )
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings
     assert "Q2146908" in entity_linker.kb.vocab.strings
-    assert "Q2146908" in entity_linker.kb._added_strings

     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -507,6 +497,32 @@ def test_overfitting_IO():
     assert predictions == GOLD_entities


+def test_kb_serialization():
+    # Test that the KB can be used in a pipeline with a different vocab
+    vector_length = 3
+    with make_tempdir() as tmp_dir:
+        kb_dir = tmp_dir / "kb"
+        nlp1 = English()
+        assert "Q2146908" not in nlp1.vocab.strings
+        mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
+        assert "Q2146908" in nlp1.vocab.strings
+        mykb.to_disk(kb_dir)
+
+        nlp2 = English()
+        assert "RandomWord" not in nlp2.vocab.strings
+        nlp2.vocab.strings.add("RandomWord")
+        assert "RandomWord" in nlp2.vocab.strings
+        assert "Q2146908" not in nlp2.vocab.strings
+
+        # Create the Entity Linker component with the KB from file, and check the final vocab
+        entity_linker = nlp2.add_pipe("entity_linker", last=True)
+        entity_linker.set_kb(load_kb(kb_dir))
+        assert "Q2146908" in nlp2.vocab.strings
+        assert "RandomWord" in nlp2.vocab.strings
+
+
 def test_scorer_links():
     train_examples = []
     nlp = English()
@@ -101,4 +101,3 @@ def test_overfitting_IO():
     doc2 = nlp2(test_text)
     assert [str(t.morph) for t in doc2] == gold_morphs
     assert [t.pos_ for t in doc2] == gold_pos_tags
-    assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings
@@ -129,6 +129,7 @@ def test_enable_pipes_method(nlp, name):

 @pytest.mark.parametrize("name", ["my_component"])
 def test_disable_pipes_context(nlp, name):
+    """Test that an enabled component stays enabled after running the context manager."""
     nlp.add_pipe("new_pipe", name=name)
     assert nlp.has_pipe(name)
     with nlp.select_pipes(disable=name):
@@ -136,6 +137,18 @@ def test_disable_pipes_context(nlp, name):
     assert nlp.has_pipe(name)


+@pytest.mark.parametrize("name", ["my_component"])
+def test_disable_pipes_context_restore(nlp, name):
+    """Test that a disabled component stays disabled after running the context manager."""
+    nlp.add_pipe("new_pipe", name=name)
+    assert nlp.has_pipe(name)
+    nlp.disable_pipe(name)
+    assert not nlp.has_pipe(name)
+    with nlp.select_pipes(disable=name):
+        assert not nlp.has_pipe(name)
+    assert not nlp.has_pipe(name)
+
+
 def test_select_pipes_list_arg(nlp):
     for name in ["c1", "c2", "c3"]:
         nlp.add_pipe("new_pipe", name=name)
@@ -80,4 +80,3 @@ def test_overfitting_IO():
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
         assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
-        assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings
@@ -98,7 +98,6 @@ def test_overfitting_IO():
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses["tagger"] < 0.00001
-    assert tagger._added_strings == {"J", "N", "V"}

     # test the trained model
     test_text = "I like blue eggs"
@@ -117,7 +116,6 @@ def test_overfitting_IO():
         assert doc2[1].tag_ is "V"
         assert doc2[2].tag_ is "J"
         assert doc2[3].tag_ is "N"
-        assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"}


 def test_tagger_requires_labels():
@@ -146,7 +146,6 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2
-    assert textcat._added_strings == {"NEGATIVE", "POSITIVE"}

     for i in range(50):
         losses = {}
@@ -168,7 +167,6 @@ def test_overfitting_IO():
         cats2 = doc2.cats
         assert cats2["POSITIVE"] > 0.9
         assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
-        assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"}

     # Test scoring
     scores = nlp.evaluate(train_examples)
@@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     [
         (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
         (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
 )
 # fmt: on
@@ -7,6 +7,7 @@ from spacy.kb import KnowledgeBase, Writer
 from spacy.vectors import Vectors
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
 from spacy.vocab import Vocab

 from ..util import make_tempdir

@@ -50,8 +51,9 @@ def custom_pipe():
             else:
                 self.cfg = None
             self.model = SerializableDummy()
+            self.vocab = vocab

-    return MyPipe(None)
+    return MyPipe(Vocab())


 def tagger():
@@ -6,8 +6,8 @@ def test_issue6207(en_tokenizer):

     # Make spans
     s1 = doc[:4]
-    s2 = doc[3:6] # overlaps with s1
-    s3 = doc[5:7] # overlaps with s2, not s1
+    s2 = doc[3:6]  # overlaps with s1
+    s3 = doc[5:7]  # overlaps with s2, not s1

     result = filter_spans((s1, s2, s3))
     assert s1 in result
@@ -1,13 +1,13 @@
 import pytest
 import srsly
 from spacy import registry, Vocab
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import TextCategorizer, SentenceRecognizer
+from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
 from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.lang.en import English
+from thinc.api import Linear
 import spacy

 from ..util import make_tempdir
@@ -89,7 +89,6 @@ def test_serialize_parser_strings(Parser):
     assert label not in vocab2.strings
     parser2 = Parser(vocab2, model, **config)
     parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
-    assert parser1._added_strings == parser2._added_strings == {"FunnyLabel"}
     assert label in parser2.vocab.strings


@@ -166,17 +165,13 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
         # check that custom labels are serialized as part of the component's strings.jsonl
         tagger.add_label(label)
         assert label in tagger.vocab.strings
-        assert tagger._added_strings == {label}
         file_path = d / "tagger1"
         tagger.to_disk(file_path)
-        strings = srsly.read_json(file_path / "strings.json")
-        assert strings == ["SomeWeirdLabel"]
         # ensure that the custom strings are loaded back in when using the tagger in another pipeline
         cfg = {"model": DEFAULT_TAGGER_MODEL}
         model = registry.resolve(cfg, validate=True)["model"]
         tagger2 = Tagger(de_vocab, model).from_disk(file_path)
         assert label in tagger2.vocab.strings
-        assert tagger2._added_strings == {label}


 def test_serialize_textcat_empty(en_vocab):
@@ -253,3 +248,40 @@ def test_serialize_pipeline_disable_enable():
     assert nlp5.pipe_names == ["ner"]
     assert nlp5.component_names == ["ner"]
     assert nlp5.disabled == []
+
+
+def test_serialize_custom_trainable_pipe():
+    class BadCustomPipe1(TrainablePipe):
+        def __init__(self, vocab):
+            pass
+
+    class BadCustomPipe2(TrainablePipe):
+        def __init__(self, vocab):
+            self.vocab = vocab
+            self.model = None
+
+    class CustomPipe(TrainablePipe):
+        def __init__(self, vocab, model):
+            self.vocab = vocab
+            self.model = model
+
+    pipe = BadCustomPipe1(Vocab())
+    with pytest.raises(ValueError):
+        pipe.to_bytes()
+    with make_tempdir() as d:
+        with pytest.raises(ValueError):
+            pipe.to_disk(d)
+    pipe = BadCustomPipe2(Vocab())
+    with pytest.raises(ValueError):
+        pipe.to_bytes()
+    with make_tempdir() as d:
+        with pytest.raises(ValueError):
+            pipe.to_disk(d)
+    pipe = CustomPipe(Vocab(), Linear())
+    pipe_bytes = pipe.to_bytes()
+    new_pipe = CustomPipe(Vocab(), Linear()).from_bytes(pipe_bytes)
+    assert new_pipe.to_bytes() == pipe_bytes
+    with make_tempdir() as d:
+        pipe.to_disk(d)
+        new_pipe = CustomPipe(Vocab(), Linear()).from_disk(d)
+        assert new_pipe.to_bytes() == pipe_bytes
@@ -1,10 +1,8 @@
 from typing import List

 import pytest
 from thinc.api import fix_random_seed, Adam, set_dropout_rate
 from numpy.testing import assert_array_equal
 import numpy

 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
 from spacy.ml.staticvectors import StaticVectors
@@ -188,12 +186,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     assert_array_equal(get_all_params(model1), get_all_params(model2))


-@pytest.mark.parametrize(
-    "model_func,kwargs",
-    [
-        (StaticVectors, {"nO": 128, "nM": 300}),
-    ]
-)
+@pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})])
 def test_empty_docs(model_func, kwargs):
     nlp = English()
     model = model_func(**kwargs).initialize()
@@ -201,7 +194,7 @@ def test_empty_docs(model_func, kwargs):
     for n_docs in range(3):
         docs = [nlp("") for _ in range(n_docs)]
         # Test predict
-        _ = model.predict(docs)
+        model.predict(docs)
         # Test backprop
         output, backprop = model.begin_update(docs)
-        _ = backprop(output)
+        backprop(output)
@@ -1419,7 +1419,7 @@ cdef class Doc:
                 if include_annotation["POS"]:
                     token_data["pos"] = token.pos_
                 if include_annotation["MORPH"]:
-                    token_data["morph"] = token.morph
+                    token_data["morph"] = token.morph.to_json()
                 if include_annotation["LEMMA"]:
                     token_data["lemma"] = token.lemma_
                 if include_annotation["DEP"]:
@@ -16,5 +16,4 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm

-    cpdef int _recalculate_indices(self) except -1
     cpdef np.ndarray to_array(self, object features)
@@ -150,7 +150,6 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#len
         """
-        self._recalculate_indices()
         if self.end < self.start:
             return 0
         return self.end - self.start
@@ -167,7 +166,6 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#getitem
         """
-        self._recalculate_indices()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
@@ -188,7 +186,6 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#iter
         """
-        self._recalculate_indices()
         for i in range(self.start, self.end):
             yield self.doc[i]

@@ -339,19 +336,6 @@ cdef class Span:
                 output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
         return output

-    cpdef int _recalculate_indices(self) except -1:
-        if self.end > self.doc.length \
-                or self.doc.c[self.start].idx != self.start_char \
-                or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
-            start = token_by_start(self.doc.c, self.doc.length, self.start_char)
-            if self.start == -1:
-                raise IndexError(Errors.E036.format(start=self.start_char))
-            end = token_by_end(self.doc.c, self.doc.length, self.end_char)
-            if end == -1:
-                raise IndexError(Errors.E037.format(end=self.end_char))
-            self.start = start
-            self.end = end + 1
-
     @property
     def vocab(self):
         """RETURNS (Vocab): The Span's Doc's vocab."""
@@ -520,7 +504,6 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#root
         """
-        self._recalculate_indices()
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called
@@ -11,11 +11,25 @@ if TYPE_CHECKING:
     from ..language import Language  # noqa: F401


+def setup_table(
+    *, cols: List[str], widths: List[int], max_width: int = 13
+) -> Tuple[List[str], List[int], List[str]]:
+    final_cols = []
+    final_widths = []
+    for col, width in zip(cols, widths):
+        if len(col) > max_width:
+            col = col[: max_width - 3] + "..."  # shorten column if too long
+        final_cols.append(col.upper())
+        final_widths.append(max(len(col), width))
+    return final_cols, final_widths, ["r" for _ in final_widths]
+
+
 @registry.loggers("spacy.ConsoleLogger.v1")
 def console_logger(progress_bar: bool = False):
     def setup_printer(
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
+        write = lambda text: stdout.write(f"{text}\n")
         msg = Printer(no_print=True)
         # ensure that only trainable components are logged
         logged_pipes = [
@@ -26,15 +40,14 @@ def console_logger(progress_bar: bool = False):
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
-        score_widths = [max(len(col), 6) for col in score_cols]
         loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
-        loss_widths = [max(len(col), 8) for col in loss_cols]
-        table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
-        table_header = [col.upper() for col in table_header]
-        table_widths = [3, 6] + loss_widths + score_widths + [6]
-        table_aligns = ["r" for _ in table_widths]
-        stdout.write(msg.row(table_header, widths=table_widths) + "\n")
-        stdout.write(msg.row(["-" * width for width in table_widths]) + "\n")
+        spacing = 2
+        table_header, table_widths, table_aligns = setup_table(
+            cols=["E", "#"] + loss_cols + score_cols + ["Score"],
+            widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
+        )
+        write(msg.row(table_header, widths=table_widths, spacing=spacing))
+        write(msg.row(["-" * width for width in table_widths], spacing=spacing))
         progress = None

     def log_step(info: Optional[Dict[str, Any]]) -> None:
@@ -70,7 +83,9 @@ def console_logger(progress_bar: bool = False):
             )
             if progress is not None:
                 progress.close()
-            stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns) + "\n")
+            write(
+                msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing)
+            )
             if progress_bar:
                 # Set disable=None, so that it disables on non-TTY
                 progress = tqdm.tqdm(
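Editor's note: to make the new helper concrete, here is a small worked example of what `setup_table` returns for a hypothetical set of columns (the values follow directly from the function above; the column names are illustrative):

    cols, widths, aligns = setup_table(
        cols=["E", "#", "Loss transformer", "ents_f", "Score"],
        widths=[3, 6, 8, 6, 6],
    )
    # cols   -> ["E", "#", "LOSS TRANS...", "ENTS_F", "SCORE"]
    #           ("Loss transformer" exceeds max_width=13, so it is cut to 10
    #            characters, "..." is appended, and the result is uppercased)
    # widths -> [3, 6, 13, 6, 6]
    # aligns -> ["r", "r", "r", "r", "r"]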
@@ -249,7 +249,10 @@ def create_evaluation_callback(

     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
+        try:
+            scores = nlp.evaluate(dev_examples)
+        except KeyError as e:
+            raise KeyError(Errors.E900.format(pipeline=nlp.pipe_names)) from e
         # Calculate a weighted sum based on score_weights for the main score.
         # We can only consider scores that are ints/floats, not dicts like
         # entity scores per type etc.
@ -622,7 +622,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
|
|||
if not path.parent.exists():
|
||||
raise IOError(Errors.E052.format(path=path.parent))
|
||||
if not path.exists() or not path.is_file():
|
||||
raise IOError(Errors.E053.format(path=path, name="meta.json"))
|
||||
raise IOError(Errors.E053.format(path=path.parent, name="meta.json"))
|
||||
meta = srsly.read_json(path)
|
||||
for setting in ["lang", "name", "version"]:
|
||||
if setting not in meta or not meta[setting]:
|
||||
|
@ -821,7 +821,7 @@ def get_object_name(obj: Any) -> str:
|
|||
obj (Any): The Python object, typically a function or class.
|
||||
RETURNS (str): A human-readable name.
|
||||
"""
|
||||
if hasattr(obj, "name"):
|
||||
if hasattr(obj, "name") and obj.name is not None:
|
||||
return obj.name
|
||||
if hasattr(obj, "__name__"):
|
||||
return obj.__name__
|
||||
|
@@ -1361,11 +1361,12 @@ def check_bool_env_var(env_var: str) -> bool:
def _pipe(docs, proc, kwargs):
if hasattr(proc, "pipe"):
yield from proc.pipe(docs, **kwargs)
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
for arg in ["batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
doc = proc(doc, **kwargs)
yield doc
else:
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
for arg in ["batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
doc = proc(doc, **kwargs)
yield doc
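Stripped of the spaCy specifics, the pattern in `_pipe` is: prefer a component's batched `pipe` method when it exists, otherwise drop the batching-only keyword arguments and call the component one document at a time. A standalone sketch of that dispatch, with toy stand-ins instead of real pipeline components (the names `apply_component` and `UppercaseToy` are invented for this example):

```python
def apply_component(docs, proc, kwargs):
    # Prefer the batched API when the component provides one
    if hasattr(proc, "pipe"):
        yield from proc.pipe(docs, **kwargs)
    else:
        # __call__ doesn't know about batching-only arguments, so drop them
        kwargs = {k: v for k, v in kwargs.items() if k != "batch_size"}
        for doc in docs:
            yield proc(doc, **kwargs)


class UppercaseToy:
    def __call__(self, doc):
        return doc.upper()


print(list(apply_component(["a", "b"], UppercaseToy(), {"batch_size": 2})))  # ['A', 'B']
```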
@@ -637,13 +637,6 @@ into the "real world". This requires 3 main components:
> window_size = 1
> maxout_pieces = 3
> subword_features = true
>
> [kb_loader]
> @misc = "spacy.EmptyKB.v1"
> entity_vector_length = 64
>
> [get_candidates]
> @misc = "spacy.CandidateGenerator.v1"
> ```

The `EntityLinker` model architecture is a Thinc `Model` with a

@@ -657,13 +650,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a

### spacy.EmptyKB.v1 {#EmptyKB}

A function that creates a default, empty `KnowledgeBase` from a
[`Vocab`](/api/vocab) instance.
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created.

| Name                   | Description                                                                          |
| ---------------------- | ------------------------------------------------------------------------------------ |
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |

### spacy.KBFromFile.v1 {#KBFromFile}

A function that reads an existing `KnowledgeBase` from file.

| Name      | Description                                               |
| --------- | --------------------------------------------------------- |
| `kb_path` | The location of the KB that was stored to file. ~~Path~~ |

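As a rough sketch (not part of this diff), a file-based knowledge base could be wired up through the `[initialize]` block of the training config, using the registered function documented above; the `kb_path` value here is a placeholder:

```ini
[initialize.components.entity_linker]

[initialize.components.entity_linker.kb_loader]
@misc = "spacy.KBFromFile.v1"
kb_path = "output/my_kb"
```
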
### spacy.CandidateGenerator.v1 {#CandidateGenerator}

A function that takes as input a [`KnowledgeBase`](/api/kb) and a

@@ -34,20 +34,20 @@ architectures and their arguments and hyperparameters.
> "incl_prior": True,
> "incl_context": True,
> "model": DEFAULT_NEL_MODEL,
> "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
> "entity_vector_length": 64,
> "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
> }
> nlp.add_pipe("entity_linker", config=config)
> ```

| Setting | Description |
| ---------------- | ----------- |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~Callable[[Vocab], KnowledgeBase]~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| Setting | Description |
| ---------------------- | ----------- |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |

```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py

@@ -65,10 +65,6 @@ architectures and their arguments and hyperparameters.
> config = {"model": {"@architectures": "my_el.v1"}}
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction via add_pipe with custom KB and candidate generation
> config = {"kb": {"@misc": "my_kb.v1"}}
> entity_linker = nlp.add_pipe("entity_linker", config=config)
>
> # Construction from class
> from spacy.pipeline import EntityLinker
> entity_linker = EntityLinker(nlp.vocab, model)

@@ -76,21 +72,25 @@ architectures and their arguments and hyperparameters.

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal
`KnowledgeBase` as well as the Candidate generator can be customized by
providing custom registered functions.
[`nlp.add_pipe`](/api/language#add_pipe).

| Name | Description |
| ---------------- | ----------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
Upon construction of the entity linker component, an empty knowledge base is
constructed with the provided `entity_vector_length`. If you want to use a
custom knowledge base, you should either call
[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
[`initialize`](/api/entitylinker#initialize) call.

| Name | Description |
| ---------------------- | ----------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
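As a loose illustration of that workflow (a sketch, not taken from this diff; the `create_kb` helper, the entity IDs and the 128-dimensional vector size are made up for the example):

```python
from spacy.lang.en import English
from spacy.kb import KnowledgeBase

nlp = English()
# Construct the component; the internal KB starts out empty with this vector size
entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 128})


def create_kb(vocab):
    # Hypothetical loader: build and fill a KB that matches entity_vector_length
    kb = KnowledgeBase(vocab, entity_vector_length=128)
    kb.add_entity(entity="Q42", freq=42, entity_vector=[0.0] * 128)
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
    return kb


# Either swap in the custom KB directly ...
entity_linker.set_kb(create_kb)
# ... or hand the loader to initialize() together with the training examples:
# entity_linker.initialize(get_examples=lambda: [], nlp=nlp, kb_loader=create_kb)
```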

## EntityLinker.\_\_call\_\_ {#call tag="method"}

@@ -139,6 +139,28 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## EntityLinker.set_kb {#set_kb tag="method" new="3"}

The `kb_loader` should be a function that takes a `Vocab` instance and creates
the `KnowledgeBase`, ensuring that the strings of the knowledge base are synced
with the current vocab.

> #### Example
>
> ```python
> def create_kb(vocab):
>     kb = KnowledgeBase(vocab, entity_vector_length=128)
>     kb.add_entity(...)
>     kb.add_alias(...)
>     return kb
> entity_linker = nlp.add_pipe("entity_linker")
> entity_linker.set_kb(create_kb)
> ```

| Name | Description |
| ----------- | ----------- |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |

## EntityLinker.initialize {#initialize tag="method" new="3"}

Initialize the component for training. `get_examples` should be a function that
@@ -150,6 +172,11 @@ network,
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).

Optionally, a `kb_loader` argument may be specified to change the internal
knowledge base. This argument should be a function that takes a `Vocab` instance
and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
are synced with the current vocab.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

This method was previously called `begin_training`.

@@ -160,7 +187,7 @@ This method was previously called `begin_training`.
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
> entity_linker.initialize(lambda: [], nlp=nlp)
> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb)
> ```

| Name | Description |

@@ -168,6 +195,7 @@ This method was previously called `begin_training`.
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |

## EntityLinker.predict {#predict tag="method"}

@@ -516,17 +516,15 @@ Many neural network models are able to use word vector tables as additional
features, which sometimes results in significant improvements in accuracy.
spaCy's built-in embedding layer,
[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
word vector tables using the `also_use_static_vectors` flag. This setting is
also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
layer, which builds the default token-to-vector encoding architecture.
word vector tables using the `include_static_vectors` flag.

```ini
[tagger.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 128
rows = 7000
also_embed_subwords = true
also_use_static_vectors = true
attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = true
```
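For the static vectors to actually be available, the pipeline also needs a vectors table at initialization time. One way this is typically wired up (shown here as a sketch; the package name is a placeholder) is via the `[initialize]` block of the config:

```ini
[initialize]
vectors = "en_core_web_lg"
```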

<Infobox title="How it works" emoji="💡">

@@ -1403,9 +1403,9 @@ especially useful if you want to pass in a string instead of calling

This example shows the implementation of a pipeline component that fetches
country meta data via the [REST Countries API](https://restcountries.eu), sets
entity annotations for countries, merges entities into one token and sets custom
attributes on the `Doc`, `Span` and `Token` – for example, the capital,
latitude/longitude coordinates and even the country flag.
entity annotations for countries and sets custom attributes on the `Doc` and
`Span` – for example, the capital, latitude/longitude coordinates and even the
country flag.

```python
### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
# Set up the PhraseMatcher with Doc patterns for each country name
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
# Register attribute on the Token. We'll be overwriting this based on
# Register attributes on the Span. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter.
Token.set_extension("is_country", default=False)
Token.set_extension("country_capital", default=False)
Token.set_extension("country_latlng", default=False)
Token.set_extension("country_flag", default=False)
# Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_country == True.
Span.set_extension("is_country", default=None)
Span.set_extension("country_capital", default=None)
Span.set_extension("country_latlng", default=None)
Span.set_extension("country_flag", default=None)
# Register attribute on Doc via a getter that checks if the Doc
# contains a country entity
Doc.set_extension("has_country", getter=self.has_country)
Span.set_extension("has_country", getter=self.has_country)

def __call__(self, doc):
spans = []  # keep the spans for later so we can merge them afterwards
for _, start, end in self.matcher(doc):
# Generate Span representing the entity & set label
entity = Span(doc, start, end, label=self.label)
# Set custom attributes on entity. Can be extended with other data
# returned by the API, like currencies, country code, calling code etc.
entity._.set("is_country", True)
entity._.set("country_capital", self.countries[entity.text]["capital"])
entity._.set("country_latlng", self.countries[entity.text]["latlng"])
entity._.set("country_flag", self.countries[entity.text]["flag"])
spans.append(entity)
# Set custom attribute on each token of the entity
# Can be extended with other data returned by the API, like
# currencies, country code, flag, calling code etc.
for token in entity:
token._.set("is_country", True)
token._.set("country_capital", self.countries[entity.text]["capital"])
token._.set("country_latlng", self.countries[entity.text]["latlng"])
token._.set("country_flag", self.countries[entity.text]["flag"])
# Iterate over all spans and merge them into one token
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
# Overwrite doc.ents and add entity – be careful not to replace!
doc.ents = list(doc.ents) + spans
return doc  # don't forget to return the Doc!

def has_country(self, tokens):
"""Getter for Doc and Span attributes. Since the getter is only called
when we access the attribute, we can refer to the Token's 'is_country'
def has_country(self, doc):
"""Getter for Doc attributes. Since the getter is only called
when we access the attribute, we can refer to the Span's 'is_country'
attribute here, which is already set in the processing step."""
return any([t._.get("is_country") for t in tokens])
return any([entity._.get("is_country") for entity in doc.ents])

nlp = English()
nlp.add_pipe("rest_countries", config={"label": "GPE"})
doc = nlp("Some text about Colombia and the Czech Republic")
print("Pipeline", nlp.pipe_names)  # pipeline contains component name
print("Doc has countries", doc._.has_country)  # Doc contains countries
for token in doc:
if token._.is_country:
print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
print("Entities", [(e.text, e.label_) for e in doc.ents])
for ent in doc.ents:
if ent._.is_country:
print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
```

In this case, all data can be fetched on initialization in one request. However,