Merge branch 'develop' into nightly.spacy.io

Commit 4f47f33793

Makefile (2 changed lines)
@@ -29,7 +29,7 @@ dist/$(SPACY_BIN) : $(WHEELHOUSE)/spacy-$(PYVER)-$(version).stamp
 	--disable-cache \
 	-o $@ \
 	$(package)==$(version) \
-	$(SPACY_EXTRAS)
+	"$(SPACY_EXTRAS)"
 	chmod a+rx $@
 	cp $@ dist/spacy.pex
 
@@ -65,9 +65,11 @@ console_scripts =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data==1.0.0rc0
+    spacy_lookups_data>=1.0.0rc0,<1.0.0
 transformers =
     spacy_transformers>=1.0.0a17,<1.0.0
+ray =
+    spacy_ray>=0.1.0,<1.0.0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =
@@ -843,7 +843,7 @@ class Language:
         *,
         config: Dict[str, Any] = SimpleFrozenDict(),
         validate: bool = True,
-    ) -> None:
+    ) -> Callable[[Doc], Doc]:
         """Replace a component in the pipeline.
 
         name (str): Name of the component to replace.
@@ -852,6 +852,7 @@ class Language:
             component. Will be merged with default config, if available.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
+        RETURNS (Callable[[Doc], Doc]): The new pipeline component.
 
         DOCS: https://nightly.spacy.io/api/language#replace_pipe
         """
@@ -866,9 +867,11 @@ class Language:
             self.remove_pipe(name)
         if not len(self._components) or pipe_index == len(self._components):
             # we have no components to insert before/after, or we're replacing the last component
-            self.add_pipe(factory_name, name=name, config=config, validate=validate)
+            return self.add_pipe(
+                factory_name, name=name, config=config, validate=validate
+            )
         else:
-            self.add_pipe(
+            return self.add_pipe(
                 factory_name,
                 name=name,
                 before=pipe_index,
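The hunks above make `Language.replace_pipe` return the newly created component instead of `None`. A minimal usage sketch (not part of the commit; the `sentencizer` factory is just a convenient stand-in):

```python
# Minimal sketch of the new return value of Language.replace_pipe.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
# replace_pipe now hands back the replacement, so it can be captured directly
new_component = nlp.replace_pipe("sentencizer", "sentencizer")
assert callable(new_component)
```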
@@ -1300,7 +1303,11 @@ class Language:
             kwargs.setdefault("batch_size", batch_size)
             # non-trainable components may have a pipe() implementation that refers to dummy
             # predict and set_annotations methods
-            if not hasattr(pipe, "pipe") or not hasattr(pipe, "is_trainable") or not pipe.is_trainable():
+            if (
+                not hasattr(pipe, "pipe")
+                or not hasattr(pipe, "is_trainable")
+                or not pipe.is_trainable()
+            ):
                 docs = _pipe(docs, pipe, kwargs)
             else:
                 docs = pipe.pipe(docs, **kwargs)
@@ -1412,7 +1419,11 @@ class Language:
             kwargs.setdefault("batch_size", batch_size)
             # non-trainable components may have a pipe() implementation that refers to dummy
             # predict and set_annotations methods
-            if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
+            if (
+                hasattr(proc, "pipe")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
+            ):
                 f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc
@@ -53,10 +53,18 @@ class AttributeRuler(Pipe):
         self.name = name
         self.vocab = vocab
         self.matcher = Matcher(self.vocab, validate=validate)
+        self.validate = validate
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
 
+    def clear(self) -> None:
+        """Reset all patterns."""
+        self.matcher = Matcher(self.vocab, validate=self.validate)
+        self.attrs = []
+        self._attrs_unnormed = []
+        self.indices = []
+
     def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]],
@@ -65,13 +73,14 @@ class AttributeRuler(Pipe):
         patterns: Optional[Iterable[AttributeRulerPatternType]] = None,
         tag_map: Optional[TagMapType] = None,
         morph_rules: Optional[MorphRulesType] = None,
-    ):
+    ) -> None:
         """Initialize the attribute ruler by adding zero or more patterns.
 
         Rules can be specified as a sequence of dicts using the `patterns`
         keyword argument. You can also provide rules using the "tag map" or
         "morph rules" formats supported by spaCy prior to v3.
         """
+        self.clear()
         if patterns:
             self.add_patterns(patterns)
         if tag_map:
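With the new `clear()` method, `AttributeRuler.initialize` now discards previously added patterns before applying the new ones, mirroring the test added further down. A small illustrative sketch (the pattern below is invented for the example, not part of the commit):

```python
# Illustrative sketch: AttributeRuler.initialize() now clears existing patterns.
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("attribute_ruler")
ruler.add_patterns([{"patterns": [[{"ORTH": "the"}]], "attrs": {"LEMMA": "the"}}])
assert len(ruler.matcher)            # one pattern is registered
ruler.initialize(lambda: [])         # resets the matcher and stored patterns
assert not len(ruler.matcher)
```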
@@ -8,6 +8,7 @@ from thinc.api import set_dropout_rate
 import warnings
 
 from ..kb import KnowledgeBase, Candidate
+from ..ml import empty_kb
 from ..tokens import Doc
 from .pipe import Pipe, deserialize_config
 from ..language import Language
@@ -41,11 +42,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
     requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
     assigns=["token.ent_kb_id"],
     default_config={
-        "kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
         "model": DEFAULT_NEL_MODEL,
         "labels_discard": [],
         "incl_prior": True,
         "incl_context": True,
+        "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
     },
     default_score_weights={
@@ -58,11 +59,11 @@ def make_entity_linker(
     nlp: Language,
     name: str,
     model: Model,
-    kb_loader: Callable[[Vocab], KnowledgeBase],
     *,
     labels_discard: Iterable[str],
     incl_prior: bool,
     incl_context: bool,
+    entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
 ):
     """Construct an EntityLinker component.
@@ -70,19 +71,21 @@ def make_entity_linker(
     model (Model[List[Doc], Floats2d]): A model that learns document vector
         representations. Given a batch of Doc objects, it should return a single
         array, with one row per item in the batch.
-    kb (KnowledgeBase): The knowledge-base to link entities to.
     labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
     incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
     incl_context (bool): Whether or not to include the local context in the model.
+    entity_vector_length (int): Size of encoding vectors in the KB.
+    get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+        produces a list of candidates, given a certain knowledge base and a textual mention.
     """
     return EntityLinker(
         nlp.vocab,
         model,
         name,
-        kb_loader=kb_loader,
         labels_discard=labels_discard,
         incl_prior=incl_prior,
         incl_context=incl_context,
+        entity_vector_length=entity_vector_length,
         get_candidates=get_candidates,
     )
 
@@ -101,10 +104,10 @@ class EntityLinker(Pipe):
         model: Model,
         name: str = "entity_linker",
         *,
-        kb_loader: Callable[[Vocab], KnowledgeBase],
         labels_discard: Iterable[str],
         incl_prior: bool,
         incl_context: bool,
+        entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
     ) -> None:
         """Initialize an entity linker.
@@ -113,10 +116,12 @@ class EntityLinker(Pipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
         labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
+        entity_vector_length (int): Size of encoding vectors in the KB.
+        get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+            produces a list of candidates, given a certain knowledge base and a textual mention.
 
         DOCS: https://nightly.spacy.io/api/entitylinker#init
         """
@@ -127,15 +132,23 @@ class EntityLinker(Pipe):
             "labels_discard": list(labels_discard),
             "incl_prior": incl_prior,
             "incl_context": incl_context,
+            "entity_vector_length": entity_vector_length,
         }
-        self.kb = kb_loader(self.vocab)
         self.get_candidates = get_candidates
         self.cfg = dict(cfg)
         self.distance = CosineDistance(normalize=False)
         # how many neightbour sentences to take into account
         self.n_sents = cfg.get("n_sents", 0)
+        # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
+        self.kb = empty_kb(entity_vector_length)(self.vocab)
 
-    def _require_kb(self) -> None:
+    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
+        """Define the KB of this pipe by providing a function that will
+        create it using this object's vocab."""
+        self.kb = kb_loader(self.vocab)
+        self.cfg["entity_vector_length"] = self.kb.entity_vector_length
+
+    def validate_kb(self) -> None:
         # Raise an error if the knowledge base is not initialized.
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))
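Taken together, these hunks move knowledge-base creation out of the factory config: the `EntityLinker` now starts with an empty KB of the configured `entity_vector_length`, and a loader is attached afterwards via the new `set_kb()`. A minimal sketch of that flow (the entity `Q42`, its vector and alias are invented for the example):

```python
# Minimal sketch of the new set_kb() flow introduced by this commit.
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker")  # starts with an empty default KB

def create_kb(vocab):
    kb = KnowledgeBase(vocab, entity_vector_length=64)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])
    return kb

entity_linker.set_kb(create_kb)
assert len(entity_linker.kb) == 1
assert entity_linker.kb.entity_vector_length == 64
```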
@@ -145,6 +158,7 @@ class EntityLinker(Pipe):
         get_examples: Callable[[], Iterable[Example]],
         *,
         nlp: Optional[Language] = None,
+        kb_loader: Callable[[Vocab], KnowledgeBase] = None,
     ):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -152,11 +166,16 @@ class EntityLinker(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
+        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
+            Note that providing this argument, will overwrite all data accumulated in the current KB.
+            Use this only when loading a KB as-such from file.
 
         DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
         self._ensure_examples(get_examples)
-        self._require_kb()
+        if kb_loader is not None:
+            self.set_kb(kb_loader)
+        self.validate_kb()
         nO = self.kb.entity_vector_length
         doc_sample = []
         vector_sample = []
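The loader can also be supplied at initialization time, either through the new `kb_loader` argument of `initialize()` or by calling `set_kb()` before `nlp.initialize()`, as the updated tests do. A self-contained sketch of the second route (all KB data below is invented):

```python
# Sketch of wiring a KB loader in before initialization; mirrors the updated
# test_preserving_links_asdoc test, with made-up entities and aliases.
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
entity_linker = nlp.add_pipe("entity_linker", last=True)

def create_kb(vocab):
    kb = KnowledgeBase(vocab, entity_vector_length=3)
    kb.add_entity(entity="Q60", freq=20, entity_vector=[0.0, 1.0, 0.0])
    kb.add_alias(alias="New York", entities=["Q60"], probabilities=[1.0])
    return kb

entity_linker.set_kb(create_kb)
nlp.initialize()  # validate_kb() passes because the KB is no longer empty
assert entity_linker.model.get_dim("nO") == 3
```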
@@ -192,7 +211,7 @@ class EntityLinker(Pipe):
 
         DOCS: https://nightly.spacy.io/api/entitylinker#update
         """
-        self._require_kb()
+        self.validate_kb()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -303,7 +322,7 @@ class EntityLinker(Pipe):
 
         DOCS: https://nightly.spacy.io/api/entitylinker#predict
         """
-        self._require_kb()
+        self.validate_kb()
         entity_count = 0
         final_kb_ids = []
         if not docs:
@@ -201,10 +201,10 @@ class EntityRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/entityruler#initialize
         """
+        self.clear()
         if patterns:
             self.add_patterns(patterns)
 
-
     @property
     def ent_ids(self) -> Tuple[str, ...]:
         """All entity ids present in the match patterns `id` properties
@@ -136,6 +136,16 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("MORPH")
 
 
+def test_attributeruler_init_clear(nlp, pattern_dicts):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("attribute_ruler")
+    assert not len(ruler.matcher)
+    ruler.add_patterns(pattern_dicts)
+    assert len(ruler.matcher)
+    ruler.initialize(lambda: [])
+    assert not len(ruler.matcher)
+
+
 def test_attributeruler_score(nlp, pattern_dicts):
     # initialize with patterns
     ruler = nlp.add_pipe("attribute_ruler")
@@ -110,7 +110,7 @@ def test_kb_invalid_entity_vector(nlp):
 
 
 def test_kb_default(nlp):
-    """Test that the default (empty) KB is loaded when not providing a config"""
+    """Test that the default (empty) KB is loaded upon construction"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
@@ -122,7 +122,7 @@ def test_kb_default(nlp):
 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
     entity_linker = nlp.add_pipe(
-        "entity_linker", config={"kb_loader": {"entity_vector_length": 35}}
+        "entity_linker", config={"entity_vector_length": 35}
     )
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
@@ -130,18 +130,9 @@ def test_kb_custom_length(nlp):
     assert entity_linker.kb.entity_vector_length == 35
 
 
-def test_kb_undefined(nlp):
-    """Test that the EL can't train without defining a KB"""
-    entity_linker = nlp.add_pipe("entity_linker", config={})
-    with pytest.raises(ValueError):
-        entity_linker.initialize(lambda: [])
-
-
-def test_kb_empty(nlp):
-    """Test that the EL can't train with an empty KB"""
-    config = {"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
-    entity_linker = nlp.add_pipe("entity_linker", config=config)
-    assert len(entity_linker.kb) == 0
+def test_kb_initialize_empty(nlp):
+    """Test that the EL can't initialize without examples"""
+    entity_linker = nlp.add_pipe("entity_linker")
     with pytest.raises(ValueError):
         entity_linker.initialize(lambda: [])
 
@@ -201,8 +192,6 @@ def test_el_pipe_configuration(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns([pattern])
 
-    @registry.misc.register("myAdamKB.v1")
-    def mykb() -> Callable[["Vocab"], KnowledgeBase]:
     def create_kb(vocab):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
         kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
@@ -212,13 +201,12 @@ def test_el_pipe_configuration(nlp):
         )
         return kb
 
-        return create_kb
-
     # run an EL pipe without a trained context encoder, to check the candidate generation step only
-    nlp.add_pipe(
+    entity_linker = nlp.add_pipe(
         "entity_linker",
-        config={"kb_loader": {"@misc": "myAdamKB.v1"}, "incl_context": False},
+        config={"incl_context": False},
     )
+    entity_linker.set_kb(create_kb)
     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
     doc = nlp(text)
@@ -234,15 +222,15 @@ def test_el_pipe_configuration(nlp):
         return get_lowercased_candidates
 
     # replace the pipe with a new one with with a different candidate generator
-    nlp.replace_pipe(
+    entity_linker = nlp.replace_pipe(
         "entity_linker",
         "entity_linker",
         config={
-            "kb_loader": {"@misc": "myAdamKB.v1"},
             "incl_context": False,
             "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
         },
     )
+    entity_linker.set_kb(create_kb)
     doc = nlp(text)
     assert doc[0].ent_kb_id_ == "Q2"
     assert doc[1].ent_kb_id_ == ""
@@ -334,8 +322,6 @@ def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""
    vector_length = 1
 
-    @registry.misc.register("myLocationsKB.v1")
-    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
     def create_kb(vocab):
         mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
         # adding entities
@@ -346,8 +332,6 @@ def test_preserving_links_asdoc(nlp):
         mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6])
         return mykb
 
-        return create_kb
-
     # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained)
     nlp.add_pipe("sentencizer")
     patterns = [
@@ -356,8 +340,9 @@ def test_preserving_links_asdoc(nlp):
     ]
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
-    el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
-    entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
+    config = {"incl_prior": False}
+    entity_linker = nlp.add_pipe("entity_linker", config=config, last=True)
+    entity_linker.set_kb(create_kb)
     nlp.initialize()
     assert entity_linker.model.get_dim("nO") == vector_length
 
@@ -435,8 +420,6 @@ def test_overfitting_IO():
         doc = nlp(text)
         train_examples.append(Example.from_dict(doc, annotation))
 
-    @registry.misc.register("myOverfittingKB.v1")
-    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
     def create_kb(vocab):
         # create artificial KB - assign same prior weight to the two russ cochran's
         # Q2146908 (Russ Cochran): American golfer
@@ -451,14 +434,12 @@ def test_overfitting_IO():
         )
         return mykb
 
-        return create_kb
-
     # Create the Entity Linker component and add it to the pipeline
     entity_linker = nlp.add_pipe(
         "entity_linker",
-        config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
         last=True,
     )
+    entity_linker.set_kb(create_kb)
 
     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -68,6 +68,15 @@ def test_entity_ruler_init_patterns(nlp, patterns):
     assert doc.ents[1].label_ == "BYE"
 
 
+def test_entity_ruler_init_clear(nlp, patterns):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    assert len(ruler.labels) == 4
+    ruler.initialize(lambda: [])
+    assert len(ruler.labels) == 0
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
@@ -71,17 +71,13 @@ def tagger():
 def entity_linker():
     nlp = Language()
 
-    @registry.misc.register("TestIssue5230KB.v1")
-    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
     def create_kb(vocab):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
         kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
         return kb
 
-        return create_kb
-
-    config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}}
-    entity_linker = nlp.add_pipe("entity_linker", config=config)
+    entity_linker = nlp.add_pipe("entity_linker")
+    entity_linker.set_kb(create_kb)
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
@@ -1,11 +1,12 @@
 from typing import Callable
 
 from spacy import util
-from spacy.lang.en import English
-from spacy.util import ensure_path, registry
+from spacy.util import ensure_path, registry, load_model_from_config
 from spacy.kb import KnowledgeBase
+from thinc.api import Config
 
 from ..util import make_tempdir
+from numpy import zeros
 
 
 def test_serialize_kb_disk(en_vocab):
@@ -80,6 +81,28 @@ def _check_kb(kb):
 def test_serialize_subclassed_kb():
     """Check that IO of a custom KB works fine as part of an EL pipe."""
 
+    config_string = """
+    [nlp]
+    lang = "en"
+    pipeline = ["entity_linker"]
+
+    [components]
+
+    [components.entity_linker]
+    factory = "entity_linker"
+
+    [initialize]
+
+    [initialize.components]
+
+    [initialize.components.entity_linker]
+
+    [initialize.components.entity_linker.kb_loader]
+    @misc = "spacy.CustomKB.v1"
+    entity_vector_length = 342
+    custom_field = 666
+    """
+
     class SubKnowledgeBase(KnowledgeBase):
         def __init__(self, vocab, entity_vector_length, custom_field):
             super().__init__(vocab, entity_vector_length)
@@ -90,23 +113,21 @@ def test_serialize_subclassed_kb():
         entity_vector_length: int, custom_field: int
     ) -> Callable[["Vocab"], KnowledgeBase]:
         def custom_kb_factory(vocab):
-            return SubKnowledgeBase(
+            kb = SubKnowledgeBase(
                 vocab=vocab,
                 entity_vector_length=entity_vector_length,
                 custom_field=custom_field,
             )
+            kb.add_entity("random_entity", 0.0, zeros(entity_vector_length))
+            return kb
 
         return custom_kb_factory
 
-    nlp = English()
-    config = {
-        "kb_loader": {
-            "@misc": "spacy.CustomKB.v1",
-            "entity_vector_length": 342,
-            "custom_field": 666,
-        }
-    }
-    entity_linker = nlp.add_pipe("entity_linker", config=config)
+    config = Config().from_str(config_string)
+    nlp = load_model_from_config(config, auto_fill=True)
+    nlp.initialize()
+
+    entity_linker = nlp.get_pipe("entity_linker")
     assert type(entity_linker.kb) == SubKnowledgeBase
     assert entity_linker.kb.entity_vector_length == 342
     assert entity_linker.kb.custom_field == 666
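For orientation, the `[initialize.components.entity_linker.kb_loader]` block in the test config above is resolved through spaCy's `misc` registry: `@misc` names a registered function and the remaining keys become its keyword arguments. A hedged sketch of that lookup (it only resolves where `spacy.CustomKB.v1` has been registered, as in the test; it is not a built-in):

```python
# Sketch of how the kb_loader block resolves via the misc registry.
from spacy.util import registry

kb_factory = registry.misc.get("spacy.CustomKB.v1")
kb_loader = kb_factory(entity_vector_length=342, custom_field=666)
# During nlp.initialize(), the loader is called with the pipeline's vocab,
# roughly equivalent to: entity_linker.set_kb(kb_loader)
```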
@@ -116,6 +137,7 @@ def test_serialize_subclassed_kb():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         entity_linker2 = nlp2.get_pipe("entity_linker")
-        assert type(entity_linker2.kb) == SubKnowledgeBase
+        # After IO, the KB is the standard one
+        assert type(entity_linker2.kb) == KnowledgeBase
         assert entity_linker2.kb.entity_vector_length == 342
-        assert entity_linker2.kb.custom_field == 666
+        assert not hasattr(entity_linker2.kb, "custom_field")
@@ -524,7 +524,7 @@ Get a pipeline component for a given component name.
 
 ## Language.replace_pipe {#replace_pipe tag="method" new="2"}
 
-Replace a component in the pipeline.
+Replace a component in the pipeline and return the new component.
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -538,7 +538,7 @@ and instead expects the **name of a component factory** registered using
 > #### Example
 >
 > ```python
-> nlp.replace_pipe("parser", my_custom_parser)
+> new_parser = nlp.replace_pipe("parser", "my_custom_parser")
 > ```
 
 | Name | Description |
@@ -548,6 +548,7 @@ and instead expects the **name of a component factory** registered using
 | _keyword-only_ | |
 | `config` <Tag variant="new">3</Tag> | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
 | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
+| **RETURNS** | The new pipeline component. ~~Callable[[Doc], Doc]~~ |
 
 ## Language.rename_pipe {#rename_pipe tag="method" new="2"}
 
@@ -11,7 +11,7 @@ api_string_name: transformer
 > #### Installation
 >
 > ```bash
-> $ pip install spacy-transformers
+> $ pip install -U %%SPACY_PKG_NAME[transformers] %%SPACY_PKG_FLAGS
 > ```
 
 <Infobox title="Important note" variant="warning">
 
@@ -386,7 +386,7 @@ by this class. Instances of this class are typically assigned to the
 [`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.
 
 | Name | Description |
-| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
 | `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
 | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
 
@@ -407,7 +407,7 @@ then be split to a list of [`TransformerData`](/api/transformer#transformerdata)
 objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
 
 | Name | Description |
-| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
 | `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
 | `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ |
@@ -216,8 +216,7 @@ in `/opt/nvidia/cuda`, you would run:
 ```bash
 ### Installation with CUDA
 $ export CUDA_PATH="/opt/nvidia/cuda"
-$ pip install cupy-cuda102
-$ pip install spacy-transformers
+$ pip install -U %%SPACY_PKG_NAME[cud102,transformers]%%SPACY_PKG_FLAGS
 ```
 
 ### Runtime usage {#transformers-runtime}
@@ -47,7 +47,7 @@ Before you install spaCy and its dependencies, make sure that your `pip`,
 
 ```bash
 $ pip install -U pip setuptools wheel
-$ pip install -U spacy
+$ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 When using pip it is generally recommended to install packages in a virtual
@@ -57,7 +57,7 @@ environment to avoid modifying system state:
 $ python -m venv .env
 $ source .env/bin/activate
 $ pip install -U pip setuptools wheel
-$ pip install spacy
+$ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 spaCy also lets you install extra dependencies by specifying the following
@@ -68,15 +68,16 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 > #### Example
 >
 > ```bash
-> $ pip install spacy[lookups,transformers]
+> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
 > ```
 
 | Name | Description |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
 | `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
+| `ray` | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training). |
 | `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
-| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). |
+| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). |
 
 ### conda {#conda}
 
@@ -88,8 +89,8 @@ $ conda install -c conda-forge spacy
 ```
 
 For the feedstock including the build recipe and configuration, check out
-[this repository](https://github.com/conda-forge/spacy-feedstock). Improvements
-and pull requests to the recipe and setup are always appreciated.
+[this repository](https://github.com/conda-forge/spacy-feedstock). Note that we
+currently don't publish any [pre-releases](#changelog-pre) on conda.
 
 ### Upgrading spaCy {#upgrading}
 
@@ -116,7 +117,7 @@ are printed. It's recommended to run the command with `python -m` to make sure
 you're executing the correct version of spaCy.
 
 ```cli
-$ pip install -U spacy
+$ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 $ python -m spacy validate
 ```
 
@@ -134,7 +135,7 @@ specifier allows cupy to be installed via wheel, saving some compilation time.
 The specifiers should install [`cupy`](https://cupy.chainer.org).
 
 ```bash
-$ pip install -U spacy[cuda92]
+$ pip install -U %%SPACY_PKG_NAME[cuda92]%%SPACY_PKG_FLAGS
 ```
 
 Once you have a GPU-enabled installation, the best way to activate it is to call
 
@@ -166,7 +166,7 @@ lookup lemmatizer looks up the token surface form in the lookup table without
 reference to the token's part-of-speech or context.
 
 ```python
-# pip install spacy-lookups-data
+# pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS
 import spacy
 
 nlp = spacy.blank("sv")
@@ -181,7 +181,7 @@ rule-based lemmatizer can be added using rule tables from
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data):
 
 ```python
-# pip install spacy-lookups-data
+# pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS
 import spacy
 
 nlp = spacy.blank("de")
@@ -1801,7 +1801,10 @@ print(doc2[5].tag_, doc2[5].pos_) # WP PRON
 
 <Infobox variant="warning" title="Migrating from spaCy v2.x">
 
-The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** in the v2.x format via its built-in methods or when the component is initialized before training. See the [migration guide](/usage/v3#migrating-training-mappings-exceptions) for details.
+The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph
+rules** in the v2.x format via its built-in methods or when the component is
+initialized before training. See the
+[migration guide](/usage/v3#migrating-training-mappings-exceptions) for details.
 
 </Infobox>
 
@@ -54,7 +54,7 @@ contribute to development.
 > separately in the same environment:
 >
 > ```bash
-> $ pip install spacy[lookups]
+> $ pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS
 > ```
 
 import Languages from 'widgets/languages.js'
 
@@ -287,7 +287,7 @@ The download command will [install the package](/usage/models#download-pip) via
 pip and place the package in your `site-packages` directory.
 
 ```cli
-$ pip install -U spacy
+$ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 $ python -m spacy download en_core_web_sm
 ```
 
@@ -813,7 +813,7 @@ full embedded visualizer, as well as individual components.
 > #### Installation
 >
 > ```bash
-> $ pip install "spacy-streamlit>=1.0.0a0"
+> $ pip install spacy-streamlit --pre
 > ```
 
 ![](../images/spacy-streamlit.png)
 
@@ -911,7 +911,7 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
 > #### Installation
 >
 > ```cli
-> $ pip install spacy-ray
+> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
 > # Check that the CLI is registered
 > $ python -m spacy ray --help
 > ```
 
@@ -297,7 +297,7 @@ packages. This lets one application easily customize the behavior of another, by
 exposing an entry point in its `setup.py`. For a quick and fun intro to entry
 points in Python, check out
 [this excellent blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/).
-spaCy can load custom function from several different entry points to add
+spaCy can load custom functions from several different entry points to add
 pipeline component factories, language classes and other settings. To make spaCy
 use your entry points, your package needs to expose them and it needs to be
 installed in the same environment – that's it.
 
@@ -1249,7 +1249,7 @@ valid.
 > #### Installation
 >
 > ```cli
-> $ pip install spacy-ray
+> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
 > # Check that the CLI is registered
 > $ python -m spacy ray --help
 > ```
 
@@ -236,7 +236,7 @@ treebank.
 > #### Example
 >
 > ```cli
-> $ pip install spacy-ray
+> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
 > # Check that the CLI is registered
 > $ python -m spacy ray --help
 > # Train a pipeline
 
@@ -272,7 +272,7 @@ add to your pipeline and customize for your use case:
 > #### Example
 >
 > ```python
-> # pip install spacy-lookups-data
+> # pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS
 > nlp = spacy.blank("en")
 > nlp.add_pipe("lemmatizer")
 > ```
 
@@ -395,7 +395,7 @@ type-check model definitions.
 For data validation, spaCy v3.0 adopts
 [`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
 validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
-lets you to register **custom functions with typed arguments**, reference them
+lets you register **custom functions with typed arguments**, reference them
 in your config and see validation errors if the argument values don't match.
 
 <Infobox title="Details & Documentation" emoji="📖" list>
@@ -30,6 +30,8 @@ const branch = isNightly ? 'develop' : 'master'
 const replacements = {
     GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
     GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`,
+    SPACY_PKG_NAME: isNightly ? 'spacy-nightly' : 'spacy',
+    SPACY_PKG_FLAGS: isNightly ? ' --pre' : '',
 }
 
 /**
 
@@ -97,7 +97,10 @@ const Changelog = () => {
             <p>
                 Pre-releases include alpha and beta versions, as well as release candidates. They
                 are not intended for production use. You can download spaCy pre-releases via the{' '}
-                <InlineCode>spacy-nightly</InlineCode> package on pip.
+                <Link to="https://pypi.org/packages/spacy-nightly">
+                    <InlineCode>spacy-nightly</InlineCode>
+                </Link>{' '}
+                package on pip.
             </p>
 
             <p>
 
@@ -28,7 +28,8 @@ import irlBackground from '../images/spacy-irl.jpg'
 
 import Benchmarks from 'usage/_benchmarks-models.md'
 
-const CODE_EXAMPLE = `# pip install spacy
+function getCodeExample(nightly) {
+    return `# pip install -U ${nightly ? 'spacy-nightly --pre' : 'spacy'}
 # python -m spacy download en_core_web_sm
 import spacy
 
@@ -52,9 +53,11 @@ print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])
 for entity in doc.ents:
     print(entity.text, entity.label_)
 `
+}
 
 const Landing = ({ data }) => {
-    const { counts } = data
+    const { counts, nightly } = data
+    const codeExample = getCodeExample(nightly)
     return (
         <>
             <LandingHeader nightly={data.nightly}>
@@ -91,7 +94,7 @@ const Landing = ({ data }) => {
                 </LandingGrid>
 
                 <LandingGrid>
-                    <LandingDemo title="Edit the code & try spaCy">{CODE_EXAMPLE}</LandingDemo>
+                    <LandingDemo title="Edit the code & try spaCy">{codeExample}</LandingDemo>
 
                     <LandingCol>
                         <H2>Features</H2>
 
@@ -141,6 +141,11 @@ const QuickstartInstall = ({ id, title }) => {
             setters={setters}
             showDropdown={showDropdown}
         >
+            {nightly && (
+                <QS package="conda" comment prompt={false}>
+                    # 🚨 Nightly releases are currently only available via pip
+                </QS>
+            )}
             <QS config="venv">python -m venv .env</QS>
             <QS config="venv" os="mac">
                 source .env/bin/activate
@@ -175,9 +180,9 @@ const QuickstartInstall = ({ id, title }) => {
             </QS>
             <QS package="source">pip install -r requirements.txt</QS>
             <QS package="source">python setup.py build_ext --inplace</QS>
-            <QS package="source" config="train">
-                pip install -e '.[{pipExtras}]'
-            </QS>
+            {(train || hardware == 'gpu') && (
+                <QS package="source">pip install -e '.[{pipExtras}]'</QS>
+            )}
 
             <QS config="train" package="conda">
                 conda install -c conda-forge spacy-transformers