Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-13 01:32:32 +03:00)

commit 906bf04239
Merge branch 'refactor/move-registrations' into kamikaze-cython3-upd
README.md
@@ -117,7 +117,7 @@ For detailed installation instructions, see the
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python >=3.7, <=3.12 (only 64 bit)
+- **Python version**: Python >=3.7, <3.13 (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)

 [pip]: https://pypi.org/project/spacy/
setup.cfg
@@ -30,7 +30,7 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.9,<3.14
+python_requires = >=3.9,<3.13
 # NOTE: This section is superseded by pyproject.toml and will be removed in
 # spaCy v4
 setup_requires =
spacy/__init__.py
@@ -19,6 +19,7 @@ from .glossary import explain  # noqa: F401
 from .language import Language
 from .util import logger, registry  # noqa: F401
 from .vocab import Vocab
+from .registrations import populate_registry, REGISTRY_POPULATED

 if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)
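The import above wires the new centralized registration module into the package root. Population is guarded by a module-level flag, so it is idempotent; a minimal sketch of how a caller can rely on that (the exact trigger point inside __init__.py is not shown in this hunk):

    import spacy.registrations as registrations

    # First call performs every registration and flips the module-level
    # REGISTRY_POPULATED flag to True.
    registrations.populate_registry()
    assert registrations.REGISTRY_POPULATED

    # Subsequent calls return immediately thanks to the guard flag, so
    # defensive calls before registry lookups are cheap.
    registrations.populate_registry()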
spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.8.5"
+__version__ = "3.8.6"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/ml/models/tok2vec.py
@@ -29,7 +29,6 @@ from ..featureextractor import FeatureExtractor
 from ..staticvectors import StaticVectors


-@registry.architectures("spacy.Tok2VecListener.v1")
 def tok2vec_listener_v1(width: int, upstream: str = "*"):
     tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec
@@ -46,7 +45,6 @@ def get_tok2vec_width(model: Model):
     return nO


-@registry.architectures("spacy.HashEmbedCNN.v2")
 def build_hash_embed_cnn_tok2vec(
     *,
     width: int,
@@ -102,7 +100,6 @@ def build_hash_embed_cnn_tok2vec(
     )


-@registry.architectures("spacy.Tok2Vec.v2")
 def build_Tok2Vec_model(
     embed: Model[List[Doc], List[Floats2d]],
     encode: Model[List[Floats2d], List[Floats2d]],
@@ -123,7 +120,6 @@ def build_Tok2Vec_model(
     return tok2vec


-@registry.architectures("spacy.MultiHashEmbed.v2")
 def MultiHashEmbed(
     width: int,
     attrs: List[Union[str, int]],
@@ -201,7 +197,6 @@ def MultiHashEmbed(
     return model


-@registry.architectures("spacy.CharacterEmbed.v2")
 def CharacterEmbed(
     width: int,
     rows: int,
@@ -278,7 +273,6 @@ def CharacterEmbed(
     return model


-@registry.architectures("spacy.MaxoutWindowEncoder.v2")
 def MaxoutWindowEncoder(
     width: int, window_size: int, maxout_pieces: int, depth: int
 ) -> Model[List[Floats2d], List[Floats2d]]:
@@ -310,7 +304,6 @@ def MaxoutWindowEncoder(
     return with_array(model, pad=receptive_field)


-@registry.architectures("spacy.MishWindowEncoder.v2")
 def MishWindowEncoder(
     width: int, window_size: int, depth: int
 ) -> Model[List[Floats2d], List[Floats2d]]:
@@ -333,7 +326,6 @@ def MishWindowEncoder(
     return with_array(model)


-@registry.architectures("spacy.TorchBiLSTMEncoder.v1")
 def BiLSTMEncoder(
     width: int, depth: int, dropout: float
 ) -> Model[List[Floats2d], List[Floats2d]]:
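Every decorator removed above reappears in the new spacy/registrations.py as an explicit registry.architectures(...)(fn) call. The two spellings are interchangeable because a catalogue registry decorator simply records the function and returns it; a small self-contained sketch using a throwaway namespace (the demo.* names are illustrative, not real spaCy registry entries):

    import catalogue

    # A stand-in for spaCy's registry.architectures namespace.
    architectures = catalogue.create("demo", "architectures")

    # Decorator form: registration runs when the defining module is imported.
    @architectures.register("demo.Listener.v1")
    def listener_v1(width: int, upstream: str = "*"):
        return (width, upstream)

    # Explicit form: identical effect, but the call can live in a separate
    # module, which is how registrations.py breaks the import cycles.
    def listener_v2(width: int, upstream: str = "*"):
        return (width, upstream)

    architectures.register("demo.Listener.v2")(listener_v2)

    assert architectures.get("demo.Listener.v1") is listener_v1
    assert architectures.get("demo.Listener.v2") is listener_v2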
spacy/pipeline/attributeruler.py
@@ -22,13 +22,6 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
 MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]


-@Language.factory(
-    "attribute_ruler",
-    default_config={
-        "validate": False,
-        "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
-    },
-)
 def make_attribute_ruler(
     nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
 ):
spacy/pipeline/dep_parser.py
@@ -39,26 +39,6 @@ subword_features = true
 DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "parser",
-    assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
-    default_config={
-        "moves": None,
-        "update_with_oracle_cut_size": 100,
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "model": DEFAULT_PARSER_MODEL,
-        "scorer": {"@scorers": "spacy.parser_scorer.v1"},
-    },
-    default_score_weights={
-        "dep_uas": 0.5,
-        "dep_las": 0.5,
-        "dep_las_per_type": None,
-        "sents_p": None,
-        "sents_r": None,
-        "sents_f": 0.0,
-    },
-)
 def make_parser(
     nlp: Language,
     name: str,
@@ -125,29 +105,6 @@ def make_parser(
     )


-@Language.factory(
-    "beam_parser",
-    assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
-    default_config={
-        "beam_width": 8,
-        "beam_density": 0.01,
-        "beam_update_prob": 0.5,
-        "moves": None,
-        "update_with_oracle_cut_size": 100,
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "model": DEFAULT_PARSER_MODEL,
-        "scorer": {"@scorers": "spacy.parser_scorer.v1"},
-    },
-    default_score_weights={
-        "dep_uas": 0.5,
-        "dep_las": 0.5,
-        "dep_las_per_type": None,
-        "sents_p": None,
-        "sents_r": None,
-        "sents_f": 0.0,
-    },
-)
 def make_beam_parser(
     nlp: Language,
     name: str,
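Moving the @Language.factory blocks out of this module does not change the public API: once register_factories() has run, the factory names resolve exactly as before. A quick usage check (a sketch, assuming a build in which importing spacy triggers the registrations):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("parser")                    # the factory name still resolves
    nlp.add_pipe("beam_parser", name="beam")  # so does the beam variant
    print(nlp.pipe_names)                     # ['parser', 'beam']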
spacy/pipeline/edit_tree_lemmatizer.py
@@ -39,20 +39,6 @@ subword_features = true
 DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "trainable_lemmatizer",
-    assigns=["token.lemma"],
-    requires=[],
-    default_config={
-        "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
-        "backoff": "orth",
-        "min_tree_freq": 3,
-        "overwrite": False,
-        "top_k": 1,
-        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
-    },
-    default_score_weights={"lemma_acc": 1.0},
-)
 def make_edit_tree_lemmatizer(
     nlp: Language,
     name: str,
spacy/pipeline/entity_linker.py
@@ -40,32 +40,6 @@ subword_features = true
 DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "entity_linker",
-    requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
-    assigns=["token.ent_kb_id"],
-    default_config={
-        "model": DEFAULT_NEL_MODEL,
-        "labels_discard": [],
-        "n_sents": 0,
-        "incl_prior": True,
-        "incl_context": True,
-        "entity_vector_length": 64,
-        "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
-        "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
-        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
-        "overwrite": True,
-        "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
-        "use_gold_ents": True,
-        "candidates_batch_size": 1,
-        "threshold": None,
-    },
-    default_score_weights={
-        "nel_micro_f": 1.0,
-        "nel_micro_r": None,
-        "nel_micro_p": None,
-    },
-)
 def make_entity_linker(
     nlp: Language,
     name: str,
spacy/pipeline/entityruler.py
@@ -19,24 +19,6 @@ DEFAULT_ENT_ID_SEP = "||"
 PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]


-@Language.factory(
-    "entity_ruler",
-    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
-    default_config={
-        "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
-        "validate": False,
-        "overwrite_ents": False,
-        "ent_id_sep": DEFAULT_ENT_ID_SEP,
-        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
-    },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
-)
 def make_entity_ruler(
     nlp: Language,
     name: str,
@@ -63,7 +45,6 @@ def entity_ruler_score(examples, **kwargs):
     return get_ner_prf(examples)


-@registry.scorers("spacy.entity_ruler_scorer.v1")
 def make_entity_ruler_scorer():
     return entity_ruler_score
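The {"@scorers": "spacy.entity_ruler_scorer.v1"} reference in the factory's default config is resolved through the registry, which is why the scorer must be registered before the config can be filled in. A minimal sketch of that resolution, assuming the registries have been populated:

    from thinc.api import Config
    from spacy import registry

    cfg = Config().from_str(
        """
        [scorer]
        @scorers = "spacy.entity_ruler_scorer.v1"
        """
    )
    # resolve() calls make_entity_ruler_scorer(), yielding entity_ruler_score.
    scorer = registry.resolve(cfg)["scorer"]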
spacy/pipeline/functions.py
@@ -73,11 +73,6 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     return doc


-@Language.factory(
-    "token_splitter",
-    default_config={"min_length": 25, "split_length": 10},
-    retokenizes=True,
-)
 def make_token_splitter(
     nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
 ):
@@ -141,10 +136,6 @@ class TokenSplitter:
         util.from_disk(path, serializers, [])


-@Language.factory(
-    "doc_cleaner",
-    default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
-)
 def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool):
     return DocCleaner(attrs, silent=silent)
spacy/pipeline/lemmatizer.py
@@ -16,17 +16,6 @@ from ..vocab import Vocab
 from .pipe import Pipe


-@Language.factory(
-    "lemmatizer",
-    assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "overwrite": False,
-        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
-    },
-    default_score_weights={"lemma_acc": 1.0},
-)
 def make_lemmatizer(
     nlp: Language,
     model: Optional[Model],
@@ -44,7 +33,6 @@ def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     return Scorer.score_token_attr(examples, "lemma", **kwargs)


-@registry.scorers("spacy.lemmatizer_scorer.v1")
 def make_lemmatizer_scorer():
     return lemmatizer_score
spacy/pipeline/morphologizer.py
@@ -47,13 +47,6 @@ maxout_pieces = 3
 DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "morphologizer",
-    assigns=["token.morph", "token.pos"],
-    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
-        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
-    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
-)
 def make_morphologizer(
     nlp: Language,
     model: Model,
spacy/pipeline/multitask.py
@@ -30,10 +30,6 @@ subword_features = true
 DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "nn_labeller",
-    default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
-)
 def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
     return MultitaskObjective(nlp.vocab, model, name)
spacy/pipeline/ner.py
@@ -36,19 +36,6 @@ subword_features = true
 DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "ner",
-    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
-    default_config={
-        "moves": None,
-        "update_with_oracle_cut_size": 100,
-        "model": DEFAULT_NER_MODEL,
-        "incorrect_spans_key": None,
-        "scorer": {"@scorers": "spacy.ner_scorer.v1"},
-    },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
-
-)
 def make_ner(
     nlp: Language,
     name: str,
@@ -101,21 +88,6 @@ def make_ner(
     )


-@Language.factory(
-    "beam_ner",
-    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
-    default_config={
-        "moves": None,
-        "update_with_oracle_cut_size": 100,
-        "model": DEFAULT_NER_MODEL,
-        "beam_density": 0.01,
-        "beam_update_prob": 0.5,
-        "beam_width": 32,
-        "incorrect_spans_key": None,
-        "scorer": None,
-    },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
-)
 def make_beam_ner(
     nlp: Language,
     name: str,
@@ -183,7 +155,6 @@ def ner_score(examples, **kwargs):
     return get_ner_prf(examples, **kwargs)


-@registry.scorers("spacy.ner_scorer.v1")
 def make_ner_scorer():
     return ner_score
spacy/pipeline/sentencizer.py
@@ -14,12 +14,6 @@ from .senter import senter_score
 BACKWARD_OVERWRITE = False


-@Language.factory(
-    "sentencizer",
-    assigns=["token.is_sent_start", "doc.sents"],
-    default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
-    default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
-)
 def make_sentencizer(
     nlp: Language,
     name: str,
spacy/pipeline/senter.py
@@ -34,12 +34,6 @@ subword_features = true
 DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "senter",
-    assigns=["token.is_sent_start"],
-    default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
-    default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
-)
 def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
     return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
@@ -53,7 +47,6 @@ def senter_score(examples, **kwargs):
     return results


-@registry.scorers("spacy.senter_scorer.v1")
 def make_senter_scorer():
     return senter_score
spacy/pipeline/span_finder.py
@@ -41,23 +41,6 @@ depth = 4
 DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"]


-@Language.factory(
-    "span_finder",
-    assigns=["doc.spans"],
-    default_config={
-        "threshold": 0.5,
-        "model": DEFAULT_SPAN_FINDER_MODEL,
-        "spans_key": DEFAULT_SPANS_KEY,
-        "max_length": 25,
-        "min_length": None,
-        "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
-    },
-    default_score_weights={
-        f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
-        f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
-        f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
-    },
-)
 def make_span_finder(
     nlp: Language,
     name: str,
@@ -97,7 +80,6 @@ def make_span_finder(
     )


-@registry.scorers("spacy.span_finder_scorer.v1")
 def make_span_finder_scorer():
     return span_finder_score
spacy/pipeline/span_ruler.py
@@ -32,24 +32,6 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
 DEFAULT_SPANS_KEY = "ruler"


-@Language.factory(
-    "future_entity_ruler",
-    assigns=["doc.ents"],
-    default_config={
-        "phrase_matcher_attr": None,
-        "validate": False,
-        "overwrite_ents": False,
-        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
-        "ent_id_sep": "__unused__",
-        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
-    },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
-)
 def make_entity_ruler(
     nlp: Language,
     name: str,
@@ -79,30 +61,6 @@ def make_entity_ruler(
     )


-@Language.factory(
-    "span_ruler",
-    assigns=["doc.spans"],
-    default_config={
-        "spans_key": DEFAULT_SPANS_KEY,
-        "spans_filter": None,
-        "annotate_ents": False,
-        "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
-        "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
-        "validate": False,
-        "overwrite": True,
-        "scorer": {
-            "@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
-            "spans_key": DEFAULT_SPANS_KEY,
-        },
-    },
-    default_score_weights={
-        f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
-        f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
-        f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
-        f"spans_{DEFAULT_SPANS_KEY}_per_type": None,
-    },
-)
 def make_span_ruler(
     nlp: Language,
     name: str,
spacy/pipeline/spancat.py
@@ -134,7 +134,6 @@ def preset_spans_suggester(
     return output


-@registry.misc("spacy.ngram_suggester.v1")
 def build_ngram_suggester(sizes: List[int]) -> Suggester:
     """Suggest all spans of the given lengths. Spans are returned as a ragged
     array of integers. The array has two columns, indicating the start and end
@@ -143,7 +142,6 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
     return partial(ngram_suggester, sizes=sizes)


-@registry.misc("spacy.ngram_range_suggester.v1")
 def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
     """Suggest all spans of the given lengths between a given min and max value - both inclusive.
     Spans are returned as a ragged array of integers. The array has two columns,
@@ -152,7 +150,6 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
     return build_ngram_suggester(sizes)


-@registry.misc("spacy.preset_spans_suggester.v1")
 def build_preset_spans_suggester(spans_key: str) -> Suggester:
     """Suggest all spans that are already stored in doc.spans[spans_key].
     This is useful when an upstream component is used to set the spans
@@ -160,19 +157,6 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
     return partial(preset_spans_suggester, spans_key=spans_key)


-@Language.factory(
-    "spancat",
-    assigns=["doc.spans"],
-    default_config={
-        "threshold": 0.5,
-        "spans_key": DEFAULT_SPANS_KEY,
-        "max_positive": None,
-        "model": DEFAULT_SPANCAT_MODEL,
-        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
-        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
-    },
-    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
-)
 def make_spancat(
     nlp: Language,
     name: str,
@@ -225,19 +209,6 @@ def make_spancat(
     )


-@Language.factory(
-    "spancat_singlelabel",
-    assigns=["doc.spans"],
-    default_config={
-        "spans_key": DEFAULT_SPANS_KEY,
-        "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
-        "negative_weight": 1.0,
-        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
-        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
-        "allow_overlap": True,
-    },
-    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
-)
 def make_spancat_singlelabel(
     nlp: Language,
     name: str,
@@ -303,7 +274,6 @@ def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     return Scorer.score_spans(examples, **kwargs)


-@registry.scorers("spacy.spancat_scorer.v1")
 def make_spancat_scorer():
     return spancat_score
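The suggester docstrings kept above describe the candidate format: a ragged integer array with one start/end row per proposed span. A short usage sketch of the ngram suggester (assuming a blank English pipeline; the printed pairs are indicative):

    import spacy
    from spacy.pipeline.spancat import build_ngram_suggester

    nlp = spacy.blank("en")
    doc = nlp("New York City")
    suggester = build_ngram_suggester(sizes=[1, 2])

    candidates = suggester([doc])  # a thinc Ragged array
    print(candidates.dataXd)       # start/end pairs such as [0 1], [1 2], [0 2], ...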
spacy/pipeline/tagger.py
@@ -35,12 +35,6 @@ subword_features = true
 DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "tagger",
-    assigns=["token.tag"],
-    default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
-    default_score_weights={"tag_acc": 1.0},
-)
 def make_tagger(
     nlp: Language,
     name: str,
@@ -64,7 +58,6 @@ def tagger_score(examples, **kwargs):
     return Scorer.score_token_attr(examples, "tag", **kwargs)


-@registry.scorers("spacy.tagger_scorer.v1")
 def make_tagger_scorer():
     return tagger_score
spacy/pipeline/textcat.py
@@ -74,27 +74,6 @@ subword_features = true
 """


-@Language.factory(
-    "textcat",
-    assigns=["doc.cats"],
-    default_config={
-        "threshold": 0.0,
-        "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
-    },
-    default_score_weights={
-        "cats_score": 1.0,
-        "cats_score_desc": None,
-        "cats_micro_p": None,
-        "cats_micro_r": None,
-        "cats_micro_f": None,
-        "cats_macro_p": None,
-        "cats_macro_r": None,
-        "cats_macro_f": None,
-        "cats_macro_auc": None,
-        "cats_f_per_type": None,
-    },
-)
 def make_textcat(
     nlp: Language,
     name: str,
@@ -123,7 +102,6 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     )


-@registry.scorers("spacy.textcat_scorer.v2")
 def make_textcat_scorer():
     return textcat_score
spacy/pipeline/textcat_multilabel.py
@@ -72,27 +72,6 @@ subword_features = true
 """


-@Language.factory(
-    "textcat_multilabel",
-    assigns=["doc.cats"],
-    default_config={
-        "threshold": 0.5,
-        "model": DEFAULT_MULTI_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
-    },
-    default_score_weights={
-        "cats_score": 1.0,
-        "cats_score_desc": None,
-        "cats_micro_p": None,
-        "cats_micro_r": None,
-        "cats_micro_f": None,
-        "cats_macro_p": None,
-        "cats_macro_r": None,
-        "cats_macro_f": None,
-        "cats_macro_auc": None,
-        "cats_f_per_type": None,
-    },
-)
 def make_multilabel_textcat(
     nlp: Language,
     name: str,
@@ -124,7 +103,6 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str,
     )


-@registry.scorers("spacy.textcat_multilabel_scorer.v2")
 def make_textcat_multilabel_scorer():
     return textcat_multilabel_score
spacy/pipeline/tok2vec.py
@@ -24,9 +24,6 @@ subword_features = true
 DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"]


-@Language.factory(
-    "tok2vec", assigns=["doc.tensor"], default_config={"model": DEFAULT_TOK2VEC_MODEL}
-)
 def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
     return Tok2Vec(nlp.vocab, model, name)
spacy/registrations.py (new file, 506 lines)
@@ -0,0 +1,506 @@
"""Centralized registry population for spaCy components.
|
||||||
|
|
||||||
|
This module centralizes registry decorations to prevent circular import issues
|
||||||
|
with Cython annotation changes from __future__ import annotations. Functions
|
||||||
|
remain in their original locations, but decoration is moved here.
|
||||||
|
"""
|
||||||
|
from typing import Dict, Any, Callable, Iterable, List, Optional, Union
|
||||||
|
|
||||||
|
# Global flag to track if registry has been populated
|
||||||
|
REGISTRY_POPULATED = False
|
||||||
|
|
||||||
|
# Global flag to track if factories have been registered
|
||||||
|
FACTORIES_REGISTERED = False
|
||||||
|
|
||||||
|
def populate_registry() -> None:
|
||||||
|
"""Populate the registry with all necessary components.
|
||||||
|
|
||||||
|
This function should be called before accessing the registry, to ensure
|
||||||
|
it's populated. The function uses a global flag to prevent repopulation.
|
||||||
|
"""
|
||||||
|
global REGISTRY_POPULATED
|
||||||
|
if REGISTRY_POPULATED:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Import all necessary modules
|
||||||
|
from .util import registry, make_first_longest_spans_filter
|
||||||
|
|
||||||
|
# Import all pipeline components that were using registry decorators
|
||||||
|
from .pipeline.tagger import make_tagger_scorer
|
||||||
|
from .pipeline.ner import make_ner_scorer
|
||||||
|
from .pipeline.lemmatizer import make_lemmatizer_scorer
|
||||||
|
from .pipeline.span_finder import make_span_finder_scorer
|
||||||
|
from .pipeline.spancat import make_spancat_scorer, build_ngram_suggester, build_ngram_range_suggester, build_preset_spans_suggester
|
||||||
|
from .pipeline.entityruler import make_entity_ruler_scorer as make_entityruler_scorer
|
||||||
|
from .pipeline.sentencizer import senter_score as make_sentencizer_scorer
|
||||||
|
from .pipeline.senter import make_senter_scorer
|
||||||
|
from .pipeline.textcat import make_textcat_scorer
|
||||||
|
from .pipeline.textcat_multilabel import make_textcat_multilabel_scorer
|
||||||
|
|
||||||
|
# Register miscellaneous components
|
||||||
|
registry.misc("spacy.first_longest_spans_filter.v1")(make_first_longest_spans_filter)
|
||||||
|
registry.misc("spacy.ngram_suggester.v1")(build_ngram_suggester)
|
||||||
|
registry.misc("spacy.ngram_range_suggester.v1")(build_ngram_range_suggester)
|
||||||
|
registry.misc("spacy.preset_spans_suggester.v1")(build_preset_spans_suggester)
|
||||||
|
|
||||||
|
# Need to get references to the existing functions in registry by importing the function that is there
|
||||||
|
# For the registry that was previously decorated
|
||||||
|
|
||||||
|
# Import ML components that use registry
|
||||||
|
from .ml.models.tok2vec import tok2vec_listener_v1, build_hash_embed_cnn_tok2vec, build_Tok2Vec_model, MultiHashEmbed, CharacterEmbed, MaxoutWindowEncoder, MishWindowEncoder, BiLSTMEncoder
|
||||||
|
|
||||||
|
# Register scorers
|
||||||
|
registry.scorers("spacy.tagger_scorer.v1")(make_tagger_scorer)
|
||||||
|
registry.scorers("spacy.ner_scorer.v1")(make_ner_scorer)
|
||||||
|
# span_ruler_scorer removed as it's not in span_ruler.py
|
||||||
|
registry.scorers("spacy.entity_ruler_scorer.v1")(make_entityruler_scorer)
|
||||||
|
registry.scorers("spacy.sentencizer_scorer.v1")(make_sentencizer_scorer)
|
||||||
|
registry.scorers("spacy.senter_scorer.v1")(make_senter_scorer)
|
||||||
|
registry.scorers("spacy.textcat_scorer.v1")(make_textcat_scorer)
|
||||||
|
registry.scorers("spacy.textcat_scorer.v2")(make_textcat_scorer)
|
||||||
|
registry.scorers("spacy.textcat_multilabel_scorer.v1")(make_textcat_multilabel_scorer)
|
||||||
|
registry.scorers("spacy.textcat_multilabel_scorer.v2")(make_textcat_multilabel_scorer)
|
||||||
|
registry.scorers("spacy.lemmatizer_scorer.v1")(make_lemmatizer_scorer)
|
||||||
|
registry.scorers("spacy.span_finder_scorer.v1")(make_span_finder_scorer)
|
||||||
|
registry.scorers("spacy.spancat_scorer.v1")(make_spancat_scorer)
|
||||||
|
|
||||||
|
# Register tok2vec architectures we've modified
|
||||||
|
registry.architectures("spacy.Tok2VecListener.v1")(tok2vec_listener_v1)
|
||||||
|
registry.architectures("spacy.HashEmbedCNN.v2")(build_hash_embed_cnn_tok2vec)
|
||||||
|
registry.architectures("spacy.Tok2Vec.v2")(build_Tok2Vec_model)
|
||||||
|
registry.architectures("spacy.MultiHashEmbed.v2")(MultiHashEmbed)
|
||||||
|
registry.architectures("spacy.CharacterEmbed.v2")(CharacterEmbed)
|
||||||
|
registry.architectures("spacy.MaxoutWindowEncoder.v2")(MaxoutWindowEncoder)
|
||||||
|
registry.architectures("spacy.MishWindowEncoder.v2")(MishWindowEncoder)
|
||||||
|
registry.architectures("spacy.TorchBiLSTMEncoder.v1")(BiLSTMEncoder)
|
||||||
|
|
||||||
|
# Register factory components
|
||||||
|
register_factories()
|
||||||
|
|
||||||
|
# Set the flag to indicate that the registry has been populated
|
||||||
|
REGISTRY_POPULATED = True
|
||||||
|
|
||||||
|
|
||||||
|
def register_factories() -> None:
|
||||||
|
"""Register all factories with the registry.
|
||||||
|
|
||||||
|
This function registers all pipeline component factories, centralizing
|
||||||
|
the registrations that were previously done with @Language.factory decorators.
|
||||||
|
"""
|
||||||
|
global FACTORIES_REGISTERED
|
||||||
|
if FACTORIES_REGISTERED:
|
||||||
|
return
|
||||||
|
|
||||||
|
from .language import Language
|
||||||
|
|
||||||
|
# Import factory default configurations
|
||||||
|
from .pipeline.entity_linker import DEFAULT_NEL_MODEL
|
||||||
|
from .pipeline.entityruler import DEFAULT_ENT_ID_SEP
|
||||||
|
from .pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||||
|
from .pipeline.senter import DEFAULT_SENTER_MODEL
|
||||||
|
from .pipeline.morphologizer import DEFAULT_MORPH_MODEL
|
||||||
|
from .pipeline.spancat import DEFAULT_SPANCAT_MODEL, DEFAULT_SPANCAT_SINGLELABEL_MODEL, DEFAULT_SPANS_KEY
|
||||||
|
from .pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY
|
||||||
|
from .pipeline.edit_tree_lemmatizer import DEFAULT_EDIT_TREE_LEMMATIZER_MODEL
|
||||||
|
from .pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
|
||||||
|
from .pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL
|
||||||
|
from .pipeline.ner import DEFAULT_NER_MODEL
|
||||||
|
from .pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||||
|
from .pipeline.tagger import DEFAULT_TAGGER_MODEL
|
||||||
|
from .pipeline.multitask import DEFAULT_MT_MODEL
|
||||||
|
|
||||||
|
# Import all factory functions
|
||||||
|
from .pipeline.attributeruler import make_attribute_ruler
|
||||||
|
from .pipeline.entity_linker import make_entity_linker
|
||||||
|
from .pipeline.entityruler import make_entity_ruler
|
||||||
|
from .pipeline.lemmatizer import make_lemmatizer
|
||||||
|
from .pipeline.textcat import make_textcat, DEFAULT_SINGLE_TEXTCAT_MODEL
|
||||||
|
from .pipeline.functions import make_token_splitter, make_doc_cleaner
|
||||||
|
from .pipeline.tok2vec import make_tok2vec
|
||||||
|
from .pipeline.senter import make_senter
|
||||||
|
from .pipeline.morphologizer import make_morphologizer
|
||||||
|
from .pipeline.spancat import make_spancat, make_spancat_singlelabel
|
||||||
|
from .pipeline.span_ruler import make_entity_ruler as make_span_entity_ruler, make_span_ruler
|
||||||
|
from .pipeline.edit_tree_lemmatizer import make_edit_tree_lemmatizer
|
||||||
|
from .pipeline.textcat_multilabel import make_multilabel_textcat
|
||||||
|
from .pipeline.span_finder import make_span_finder
|
||||||
|
from .pipeline.ner import make_ner, make_beam_ner
|
||||||
|
from .pipeline.dep_parser import make_parser, make_beam_parser
|
||||||
|
from .pipeline.tagger import make_tagger
|
||||||
|
from .pipeline.multitask import make_nn_labeller
|
||||||
|
from .pipeline.sentencizer import make_sentencizer
|
||||||
|
|
||||||
|
# Register factories using the same pattern as Language.factory decorator
|
||||||
|
# We use Language.factory()() pattern which exactly mimics the decorator
|
||||||
|
|
||||||
|
# attributeruler
|
||||||
|
Language.factory(
|
||||||
|
"attribute_ruler",
|
||||||
|
default_config={
|
||||||
|
"validate": False,
|
||||||
|
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
|
||||||
|
},
|
||||||
|
)(make_attribute_ruler)
|
||||||
|
|
||||||
|
# entity_linker
|
||||||
|
Language.factory(
|
||||||
|
"entity_linker",
|
||||||
|
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
|
||||||
|
assigns=["token.ent_kb_id"],
|
||||||
|
default_config={
|
||||||
|
"model": DEFAULT_NEL_MODEL,
|
||||||
|
"labels_discard": [],
|
||||||
|
"n_sents": 0,
|
||||||
|
"incl_prior": True,
|
||||||
|
"incl_context": True,
|
||||||
|
"entity_vector_length": 64,
|
||||||
|
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||||
|
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||||
|
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||||
|
"overwrite": True,
|
||||||
|
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||||
|
"use_gold_ents": True,
|
||||||
|
"candidates_batch_size": 1,
|
||||||
|
"threshold": None,
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"nel_micro_f": 1.0,
|
||||||
|
"nel_micro_r": None,
|
||||||
|
"nel_micro_p": None,
|
||||||
|
},
|
||||||
|
)(make_entity_linker)
|
||||||
|
|
||||||
|
# entity_ruler
|
||||||
|
Language.factory(
|
||||||
|
"entity_ruler",
|
||||||
|
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
|
||||||
|
default_config={
|
||||||
|
"phrase_matcher_attr": None,
|
||||||
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
|
"validate": False,
|
||||||
|
"overwrite_ents": False,
|
||||||
|
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||||
|
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"ents_f": 1.0,
|
||||||
|
"ents_p": 0.0,
|
||||||
|
"ents_r": 0.0,
|
||||||
|
"ents_per_type": None,
|
||||||
|
},
|
||||||
|
)(make_entity_ruler)
|
||||||
|
|
||||||
|
# lemmatizer
|
||||||
|
Language.factory(
|
||||||
|
"lemmatizer",
|
||||||
|
assigns=["token.lemma"],
|
||||||
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "lookup",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
|
)(make_lemmatizer)
|
||||||
|
|
||||||
|
# textcat
|
||||||
|
Language.factory(
|
||||||
|
"textcat",
|
||||||
|
assigns=["doc.cats"],
|
||||||
|
default_config={
|
||||||
|
"threshold": 0.0,
|
||||||
|
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||||
|
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"cats_score": 1.0,
|
||||||
|
"cats_score_desc": None,
|
||||||
|
"cats_micro_p": None,
|
||||||
|
"cats_micro_r": None,
|
||||||
|
"cats_micro_f": None,
|
||||||
|
"cats_macro_p": None,
|
||||||
|
"cats_macro_r": None,
|
||||||
|
"cats_macro_f": None,
|
||||||
|
"cats_macro_auc": None,
|
||||||
|
"cats_f_per_type": None,
|
||||||
|
},
|
||||||
|
)(make_textcat)
|
||||||
|
|
||||||
|
# token_splitter
|
||||||
|
Language.factory(
|
||||||
|
"token_splitter",
|
||||||
|
default_config={"min_length": 25, "split_length": 10},
|
||||||
|
retokenizes=True,
|
||||||
|
)(make_token_splitter)
|
||||||
|
|
||||||
|
# doc_cleaner
|
||||||
|
Language.factory(
|
||||||
|
"doc_cleaner",
|
||||||
|
default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
|
||||||
|
)(make_doc_cleaner)
|
||||||
|
|
||||||
|
# tok2vec
|
||||||
|
Language.factory(
|
||||||
|
"tok2vec",
|
||||||
|
assigns=["doc.tensor"],
|
||||||
|
default_config={"model": DEFAULT_TOK2VEC_MODEL}
|
||||||
|
)(make_tok2vec)
|
||||||
|
|
||||||
|
# senter
|
||||||
|
Language.factory(
|
||||||
|
"senter",
|
||||||
|
assigns=["token.is_sent_start"],
|
||||||
|
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||||
|
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||||
|
)(make_senter)
|
||||||
|
|
||||||
|
# morphologizer
|
||||||
|
Language.factory(
|
||||||
|
"morphologizer",
|
||||||
|
assigns=["token.morph", "token.pos"],
|
||||||
|
default_config={
|
||||||
|
"model": DEFAULT_MORPH_MODEL,
|
||||||
|
"overwrite": True,
|
||||||
|
"extend": False,
|
||||||
|
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
|
||||||
|
"label_smoothing": 0.0
|
||||||
|
},
|
||||||
|
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||||
|
)(make_morphologizer)
|
||||||
|
|
||||||
|
# spancat
|
||||||
|
Language.factory(
|
||||||
|
"spancat",
|
||||||
|
assigns=["doc.spans"],
|
||||||
|
default_config={
|
||||||
|
"threshold": 0.5,
|
||||||
|
"spans_key": DEFAULT_SPANS_KEY,
|
||||||
|
"max_positive": None,
|
||||||
|
"model": DEFAULT_SPANCAT_MODEL,
|
||||||
|
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||||
|
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||||
|
)(make_spancat)
|
||||||
|
|
||||||
|
# spancat_singlelabel
|
||||||
|
Language.factory(
|
||||||
|
"spancat_singlelabel",
|
||||||
|
assigns=["doc.spans"],
|
||||||
|
default_config={
|
||||||
|
"spans_key": DEFAULT_SPANS_KEY,
|
||||||
|
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||||
|
"negative_weight": 1.0,
|
||||||
|
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||||
|
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||||
|
"allow_overlap": True,
|
||||||
|
},
|
||||||
|
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||||
|
)(make_spancat_singlelabel)
|
||||||
|
|
||||||
|
# future_entity_ruler
|
||||||
|
Language.factory(
|
||||||
|
"future_entity_ruler",
|
||||||
|
assigns=["doc.ents"],
|
||||||
|
default_config={
|
||||||
|
"phrase_matcher_attr": None,
|
||||||
|
"validate": False,
|
||||||
|
"overwrite_ents": False,
|
||||||
|
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||||
|
"ent_id_sep": "__unused__",
|
||||||
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"ents_f": 1.0,
|
||||||
|
"ents_p": 0.0,
|
||||||
|
"ents_r": 0.0,
|
||||||
|
"ents_per_type": None,
|
||||||
|
},
|
||||||
|
)(make_span_entity_ruler)
|
||||||
|
|
||||||
|
# span_ruler
|
||||||
|
Language.factory(
|
||||||
|
"span_ruler",
|
||||||
|
assigns=["doc.spans"],
|
||||||
|
default_config={
|
||||||
|
"spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
|
||||||
|
"spans_filter": None,
|
||||||
|
"annotate_ents": False,
|
||||||
|
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
|
||||||
|
"phrase_matcher_attr": None,
|
||||||
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
|
"validate": False,
|
||||||
|
"overwrite": True,
|
||||||
|
"scorer": {
|
||||||
|
"@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
|
||||||
|
"spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0,
|
||||||
|
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0,
|
||||||
|
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0,
|
||||||
|
f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
|
||||||
|
},
|
||||||
|
)(make_span_ruler)
|
||||||
|
|
||||||
|
# trainable_lemmatizer
|
||||||
|
Language.factory(
|
||||||
|
"trainable_lemmatizer",
|
||||||
|
assigns=["token.lemma"],
|
||||||
|
requires=[],
|
||||||
|
default_config={
|
||||||
|
"model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
|
||||||
|
"backoff": "orth",
|
||||||
|
"min_tree_freq": 3,
|
||||||
|
"overwrite": False,
|
||||||
|
"top_k": 1,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
|
)(make_edit_tree_lemmatizer)
|
||||||
|
|
||||||
|
# textcat_multilabel
|
||||||
|
Language.factory(
|
||||||
|
"textcat_multilabel",
|
||||||
|
assigns=["doc.cats"],
|
||||||
|
default_config={
|
||||||
|
"threshold": 0.5,
|
||||||
|
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
||||||
|
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"cats_score": 1.0,
|
||||||
|
"cats_score_desc": None,
|
||||||
|
"cats_micro_p": None,
|
||||||
|
"cats_micro_r": None,
|
||||||
|
"cats_micro_f": None,
|
||||||
|
"cats_macro_p": None,
|
||||||
|
"cats_macro_r": None,
|
||||||
|
"cats_macro_f": None,
|
||||||
|
"cats_macro_auc": None,
|
||||||
|
"cats_f_per_type": None,
|
||||||
|
},
|
||||||
|
)(make_multilabel_textcat)
|
||||||
|
|
||||||
|
# span_finder
|
||||||
|
Language.factory(
|
||||||
|
"span_finder",
|
||||||
|
assigns=["doc.spans"],
|
||||||
|
default_config={
|
||||||
|
"threshold": 0.5,
|
||||||
|
"model": DEFAULT_SPAN_FINDER_MODEL,
|
||||||
|
"spans_key": DEFAULT_SPANS_KEY,
|
||||||
|
"max_length": 25,
|
||||||
|
"min_length": None,
|
||||||
|
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
|
||||||
|
f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
|
||||||
|
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
|
||||||
|
},
|
||||||
|
)(make_span_finder)
|
||||||
|
|
||||||
|
# ner
|
||||||
|
Language.factory(
|
||||||
|
"ner",
|
||||||
|
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||||
|
default_config={
|
||||||
|
"moves": None,
|
||||||
|
"update_with_oracle_cut_size": 100,
|
||||||
|
"model": DEFAULT_NER_MODEL,
|
||||||
|
"incorrect_spans_key": None,
|
||||||
|
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||||
|
)(make_ner)
|
||||||
|
|
||||||
|
# beam_ner
|
||||||
|
Language.factory(
|
||||||
|
"beam_ner",
|
||||||
|
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||||
|
default_config={
|
||||||
|
"moves": None,
|
||||||
|
"update_with_oracle_cut_size": 100,
|
||||||
|
"model": DEFAULT_NER_MODEL,
|
||||||
|
"beam_density": 0.01,
|
||||||
|
"beam_update_prob": 0.5,
|
||||||
|
"beam_width": 32,
|
||||||
|
"incorrect_spans_key": None,
|
||||||
|
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||||
|
)(make_beam_ner)
|
||||||
|
|
||||||
|
# parser
|
||||||
|
Language.factory(
|
||||||
|
"parser",
|
||||||
|
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
||||||
|
default_config={
|
||||||
|
"moves": None,
|
||||||
|
"update_with_oracle_cut_size": 100,
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"model": DEFAULT_PARSER_MODEL,
|
||||||
|
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"dep_uas": 0.5,
|
||||||
|
"dep_las": 0.5,
|
||||||
|
"dep_las_per_type": None,
|
||||||
|
"sents_p": None,
|
||||||
|
"sents_r": None,
|
||||||
|
"sents_f": 0.0,
|
||||||
|
},
|
||||||
|
)(make_parser)
|
||||||
|
|
||||||
|
# beam_parser
|
||||||
|
Language.factory(
|
||||||
|
"beam_parser",
|
||||||
|
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
||||||
|
default_config={
|
||||||
|
"moves": None,
|
||||||
|
"update_with_oracle_cut_size": 100,
|
||||||
|
"learn_tokens": False,
|
||||||
|
"min_action_freq": 30,
|
||||||
|
"beam_width": 8,
|
||||||
|
"beam_density": 0.0001,
|
||||||
|
"beam_update_prob": 0.5,
|
||||||
|
"model": DEFAULT_PARSER_MODEL,
|
||||||
|
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={
|
||||||
|
"dep_uas": 0.5,
|
||||||
|
"dep_las": 0.5,
|
||||||
|
"dep_las_per_type": None,
|
||||||
|
"sents_p": None,
|
||||||
|
"sents_r": None,
|
||||||
|
"sents_f": 0.0,
|
||||||
|
},
|
||||||
|
)(make_beam_parser)
|
||||||
|
|
||||||
|
# tagger
|
||||||
|
Language.factory(
|
||||||
|
"tagger",
|
||||||
|
assigns=["token.tag"],
|
||||||
|
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
|
||||||
|
default_score_weights={"tag_acc": 1.0, "pos_acc": 0.0, "tag_micro_p": None, "tag_micro_r": None, "tag_micro_f": None},
|
||||||
|
)(make_tagger)
|
||||||
|
|
||||||
|
# nn_labeller
|
||||||
|
Language.factory(
|
||||||
|
"nn_labeller",
|
||||||
|
default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
|
||||||
|
)(make_nn_labeller)
|
||||||
|
|
||||||
|
# sentencizer
|
||||||
|
Language.factory(
|
||||||
|
"sentencizer",
|
||||||
|
assigns=["token.is_sent_start", "doc.sents"],
|
||||||
|
default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||||
|
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||||
|
)(make_sentencizer)
|
||||||
|
|
||||||
|
# Set the flag to indicate that all factories have been registered
|
||||||
|
FACTORIES_REGISTERED = True
|
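Both entry points defer their imports to call time and are guarded by module-level flags, so they can be called eagerly, lazily, or repeatedly without harm. A smoke-test sketch:

    from spacy.language import Language
    from spacy.registrations import populate_registry, register_factories

    populate_registry()   # also runs register_factories()
    register_factories()  # second call is a no-op

    assert Language.has_factory("parser")
    assert Language.has_factory("beam_ner")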
spacy/tests/factory_registrations.json (new file, 132 lines)
@@ -0,0 +1,132 @@
{
|
||||||
|
"attribute_ruler": {
|
||||||
|
"name": "attribute_ruler",
|
||||||
|
"module": "spacy.pipeline.attributeruler",
|
||||||
|
"function": "make_attribute_ruler"
|
||||||
|
},
|
||||||
|
"beam_ner": {
|
||||||
|
"name": "beam_ner",
|
||||||
|
"module": "spacy.pipeline.ner",
|
||||||
|
"function": "make_beam_ner"
|
||||||
|
},
|
||||||
|
"beam_parser": {
|
||||||
|
"name": "beam_parser",
|
||||||
        "module": "spacy.pipeline.dep_parser",
        "function": "make_beam_parser"
    },
    "doc_cleaner": {
        "name": "doc_cleaner",
        "module": "spacy.pipeline.functions",
        "function": "make_doc_cleaner"
    },
    "entity_linker": {
        "name": "entity_linker",
        "module": "spacy.pipeline.entity_linker",
        "function": "make_entity_linker"
    },
    "entity_ruler": {
        "name": "entity_ruler",
        "module": "spacy.pipeline.entityruler",
        "function": "make_entity_ruler"
    },
    "future_entity_ruler": {
        "name": "future_entity_ruler",
        "module": "spacy.pipeline.span_ruler",
        "function": "make_entity_ruler"
    },
    "lemmatizer": {
        "name": "lemmatizer",
        "module": "spacy.pipeline.lemmatizer",
        "function": "make_lemmatizer"
    },
    "merge_entities": {
        "name": "merge_entities",
        "module": "spacy.language",
        "function": "Language.component.<locals>.add_component.<locals>.factory_func"
    },
    "merge_noun_chunks": {
        "name": "merge_noun_chunks",
        "module": "spacy.language",
        "function": "Language.component.<locals>.add_component.<locals>.factory_func"
    },
    "merge_subtokens": {
        "name": "merge_subtokens",
        "module": "spacy.language",
        "function": "Language.component.<locals>.add_component.<locals>.factory_func"
    },
    "morphologizer": {
        "name": "morphologizer",
        "module": "spacy.pipeline.morphologizer",
        "function": "make_morphologizer"
    },
    "ner": {
        "name": "ner",
        "module": "spacy.pipeline.ner",
        "function": "make_ner"
    },
    "parser": {
        "name": "parser",
        "module": "spacy.pipeline.dep_parser",
        "function": "make_parser"
    },
    "sentencizer": {
        "name": "sentencizer",
        "module": "spacy.pipeline.sentencizer",
        "function": "make_sentencizer"
    },
    "senter": {
        "name": "senter",
        "module": "spacy.pipeline.senter",
        "function": "make_senter"
    },
    "span_finder": {
        "name": "span_finder",
        "module": "spacy.pipeline.span_finder",
        "function": "make_span_finder"
    },
    "span_ruler": {
        "name": "span_ruler",
        "module": "spacy.pipeline.span_ruler",
        "function": "make_span_ruler"
    },
    "spancat": {
        "name": "spancat",
        "module": "spacy.pipeline.spancat",
        "function": "make_spancat"
    },
    "spancat_singlelabel": {
        "name": "spancat_singlelabel",
        "module": "spacy.pipeline.spancat",
        "function": "make_spancat_singlelabel"
    },
    "tagger": {
        "name": "tagger",
        "module": "spacy.pipeline.tagger",
        "function": "make_tagger"
    },
    "textcat": {
        "name": "textcat",
        "module": "spacy.pipeline.textcat",
        "function": "make_textcat"
    },
    "textcat_multilabel": {
        "name": "textcat_multilabel",
        "module": "spacy.pipeline.textcat_multilabel",
        "function": "make_multilabel_textcat"
    },
    "tok2vec": {
        "name": "tok2vec",
        "module": "spacy.pipeline.tok2vec",
        "function": "make_tok2vec"
    },
    "token_splitter": {
        "name": "token_splitter",
        "module": "spacy.pipeline.functions",
        "function": "make_token_splitter"
    },
    "trainable_lemmatizer": {
        "name": "trainable_lemmatizer",
        "module": "spacy.pipeline.edit_tree_lemmatizer",
        "function": "make_edit_tree_lemmatizer"
    }
}
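The fixture in test_factory_registrations.py (further down) asks contributors to regenerate this file via export_factory_registrations.py, which is not part of this diff. A minimal sketch of what such a script plausibly does, using only the registry API exercised by the tests (ensure_populated, factories.get_all) and omitting the Cython-specific fallbacks:

# Sketch only (assumed, not the repo's actual export script): dump the
# name/module/function triple for every registered factory.
import json
from pathlib import Path

from spacy.util import registry

registry.ensure_populated()
payload = {
    name: {
        "name": name,
        "module": getattr(func, "__module__", str(func)),
        "function": getattr(func, "__qualname__", str(func)),
    }
    for name, func in sorted(registry.factories.get_all().items())
}
Path("spacy/tests/factory_registrations.json").write_text(json.dumps(payload, indent=4))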
284 spacy/tests/registry_contents.json Normal file
@@ -0,0 +1,284 @@
{
    "architectures": [
        "spacy-legacy.CharacterEmbed.v1",
        "spacy-legacy.EntityLinker.v1",
        "spacy-legacy.HashEmbedCNN.v1",
        "spacy-legacy.MaxoutWindowEncoder.v1",
        "spacy-legacy.MishWindowEncoder.v1",
        "spacy-legacy.MultiHashEmbed.v1",
        "spacy-legacy.Tagger.v1",
        "spacy-legacy.TextCatBOW.v1",
        "spacy-legacy.TextCatCNN.v1",
        "spacy-legacy.TextCatEnsemble.v1",
        "spacy-legacy.Tok2Vec.v1",
        "spacy-legacy.TransitionBasedParser.v1",
        "spacy.CharacterEmbed.v2",
        "spacy.EntityLinker.v2",
        "spacy.HashEmbedCNN.v2",
        "spacy.MaxoutWindowEncoder.v2",
        "spacy.MishWindowEncoder.v2",
        "spacy.MultiHashEmbed.v2",
        "spacy.PretrainCharacters.v1",
        "spacy.PretrainVectors.v1",
        "spacy.SpanCategorizer.v1",
        "spacy.SpanFinder.v1",
        "spacy.Tagger.v2",
        "spacy.TextCatBOW.v2",
        "spacy.TextCatBOW.v3",
        "spacy.TextCatCNN.v2",
        "spacy.TextCatEnsemble.v2",
        "spacy.TextCatLowData.v1",
        "spacy.TextCatParametricAttention.v1",
        "spacy.TextCatReduce.v1",
        "spacy.Tok2Vec.v2",
        "spacy.Tok2VecListener.v1",
        "spacy.TorchBiLSTMEncoder.v1",
        "spacy.TransitionBasedParser.v2"
    ],
    "augmenters": [
        "spacy.combined_augmenter.v1",
        "spacy.lower_case.v1",
        "spacy.orth_variants.v1"
    ],
    "batchers": [
        "spacy.batch_by_padded.v1",
        "spacy.batch_by_sequence.v1",
        "spacy.batch_by_words.v1"
    ],
    "callbacks": [
        "spacy.copy_from_base_model.v1",
        "spacy.models_and_pipes_with_nvtx_range.v1",
        "spacy.models_with_nvtx_range.v1"
    ],
    "cli": [],
    "datasets": [],
    "displacy_colors": [],
    "factories": [
        "attribute_ruler",
        "beam_ner",
        "beam_parser",
        "doc_cleaner",
        "entity_linker",
        "entity_ruler",
        "future_entity_ruler",
        "lemmatizer",
        "merge_entities",
        "merge_noun_chunks",
        "merge_subtokens",
        "morphologizer",
        "ner",
        "parser",
        "sentencizer",
        "senter",
        "span_finder",
        "span_ruler",
        "spancat",
        "spancat_singlelabel",
        "tagger",
        "textcat",
        "textcat_multilabel",
        "tok2vec",
        "token_splitter",
        "trainable_lemmatizer"
    ],
    "initializers": [
        "glorot_normal_init.v1",
        "glorot_uniform_init.v1",
        "he_normal_init.v1",
        "he_uniform_init.v1",
        "lecun_normal_init.v1",
        "lecun_uniform_init.v1",
        "normal_init.v1",
        "uniform_init.v1",
        "zero_init.v1"
    ],
    "languages": [],
    "layers": [
        "CauchySimilarity.v1",
        "ClippedLinear.v1",
        "Dish.v1",
        "Dropout.v1",
        "Embed.v1",
        "Gelu.v1",
        "HardSigmoid.v1",
        "HardSwish.v1",
        "HardSwishMobilenet.v1",
        "HardTanh.v1",
        "HashEmbed.v1",
        "LSTM.v1",
        "LayerNorm.v1",
        "Linear.v1",
        "Logistic.v1",
        "MXNetWrapper.v1",
        "Maxout.v1",
        "Mish.v1",
        "MultiSoftmax.v1",
        "ParametricAttention.v1",
        "ParametricAttention.v2",
        "PyTorchLSTM.v1",
        "PyTorchRNNWrapper.v1",
        "PyTorchWrapper.v1",
        "PyTorchWrapper.v2",
        "PyTorchWrapper.v3",
        "Relu.v1",
        "ReluK.v1",
        "Sigmoid.v1",
        "Softmax.v1",
        "Softmax.v2",
        "SparseLinear.v1",
        "SparseLinear.v2",
        "Swish.v1",
        "add.v1",
        "bidirectional.v1",
        "chain.v1",
        "clone.v1",
        "concatenate.v1",
        "expand_window.v1",
        "list2array.v1",
        "list2padded.v1",
        "list2ragged.v1",
        "noop.v1",
        "padded2list.v1",
        "premap_ids.v1",
        "ragged2list.v1",
        "reduce_first.v1",
        "reduce_last.v1",
        "reduce_max.v1",
        "reduce_mean.v1",
        "reduce_sum.v1",
        "remap_ids.v1",
        "remap_ids.v2",
        "residual.v1",
        "resizable.v1",
        "siamese.v1",
        "sigmoid_activation.v1",
        "softmax_activation.v1",
        "spacy-legacy.StaticVectors.v1",
        "spacy.CharEmbed.v1",
        "spacy.FeatureExtractor.v1",
        "spacy.LinearLogistic.v1",
        "spacy.PrecomputableAffine.v1",
        "spacy.StaticVectors.v2",
        "spacy.TransitionModel.v1",
        "spacy.extract_ngrams.v1",
        "spacy.extract_spans.v1",
        "spacy.mean_max_reducer.v1",
        "strings2arrays.v1",
        "tuplify.v1",
        "uniqued.v1",
        "with_array.v1",
        "with_array2d.v1",
        "with_cpu.v1",
        "with_flatten.v1",
        "with_flatten.v2",
        "with_getitem.v1",
        "with_list.v1",
        "with_padded.v1",
        "with_ragged.v1",
        "with_reshape.v1"
    ],
    "lemmatizers": [],
    "loggers": [
        "spacy-legacy.ConsoleLogger.v1",
        "spacy-legacy.ConsoleLogger.v2",
        "spacy-legacy.WandbLogger.v1",
        "spacy.ChainLogger.v1",
        "spacy.ClearMLLogger.v1",
        "spacy.ClearMLLogger.v2",
        "spacy.ConsoleLogger.v2",
        "spacy.ConsoleLogger.v3",
        "spacy.CupyLogger.v1",
        "spacy.LookupLogger.v1",
        "spacy.MLflowLogger.v1",
        "spacy.MLflowLogger.v2",
        "spacy.PyTorchLogger.v1",
        "spacy.WandbLogger.v1",
        "spacy.WandbLogger.v2",
        "spacy.WandbLogger.v3",
        "spacy.WandbLogger.v4",
        "spacy.WandbLogger.v5"
    ],
    "lookups": [],
    "losses": [
        "CategoricalCrossentropy.v1",
        "CategoricalCrossentropy.v2",
        "CategoricalCrossentropy.v3",
        "CosineDistance.v1",
        "L2Distance.v1",
        "SequenceCategoricalCrossentropy.v1",
        "SequenceCategoricalCrossentropy.v2",
        "SequenceCategoricalCrossentropy.v3"
    ],
    "misc": [
        "spacy.CandidateBatchGenerator.v1",
        "spacy.CandidateGenerator.v1",
        "spacy.EmptyKB.v1",
        "spacy.EmptyKB.v2",
        "spacy.KBFromFile.v1",
        "spacy.LookupsDataLoader.v1",
        "spacy.first_longest_spans_filter.v1",
        "spacy.levenshtein_compare.v1",
        "spacy.ngram_range_suggester.v1",
        "spacy.ngram_suggester.v1",
        "spacy.preset_spans_suggester.v1",
        "spacy.prioritize_existing_ents_filter.v1",
        "spacy.prioritize_new_ents_filter.v1"
    ],
    "models": [],
    "ops": [
        "CupyOps",
        "MPSOps",
        "NumpyOps"
    ],
    "optimizers": [
        "Adam.v1",
        "RAdam.v1",
        "SGD.v1"
    ],
    "readers": [
        "ml_datasets.cmu_movies.v1",
        "ml_datasets.dbpedia.v1",
        "ml_datasets.imdb_sentiment.v1",
        "spacy.Corpus.v1",
        "spacy.JsonlCorpus.v1",
        "spacy.PlainTextCorpus.v1",
        "spacy.read_labels.v1",
        "srsly.read_json.v1",
        "srsly.read_jsonl.v1",
        "srsly.read_msgpack.v1",
        "srsly.read_yaml.v1"
    ],
    "schedules": [
        "compounding.v1",
        "constant.v1",
        "constant_then.v1",
        "cyclic_triangular.v1",
        "decaying.v1",
        "slanted_triangular.v1",
        "warmup_linear.v1"
    ],
    "scorers": [
        "spacy-legacy.textcat_multilabel_scorer.v1",
        "spacy-legacy.textcat_scorer.v1",
        "spacy.attribute_ruler_scorer.v1",
        "spacy.entity_linker_scorer.v1",
        "spacy.entity_ruler_scorer.v1",
        "spacy.lemmatizer_scorer.v1",
        "spacy.morphologizer_scorer.v1",
        "spacy.ner_scorer.v1",
        "spacy.overlapping_labeled_spans_scorer.v1",
        "spacy.parser_scorer.v1",
        "spacy.senter_scorer.v1",
        "spacy.span_finder_scorer.v1",
        "spacy.spancat_scorer.v1",
        "spacy.tagger_scorer.v1",
        "spacy.textcat_multilabel_scorer.v2",
        "spacy.textcat_scorer.v2"
    ],
    "tokenizers": [
        "spacy.Tokenizer.v1"
    ],
    "vectors": [
        "spacy.Vectors.v1"
    ]
}
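registry_contents.json pins the full set of names per registry. This diff does not include a dedicated export script for it; a sketch of how such a snapshot could be regenerated, mirroring exactly what test_registry_entries (below) reads back:

# Sketch only: snapshot every registry's sorted entry names into the
# reference file, using the same registry API the tests rely on.
import json
from pathlib import Path

from spacy.util import registry

registry.ensure_populated()
snapshot = {
    name: sorted(getattr(registry, name).get_all().keys())
    for name in sorted(registry.get_registry_names())
}
Path("spacy/tests/registry_contents.json").write_text(json.dumps(snapshot, indent=4))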
76 spacy/tests/test_factory_registrations.py Normal file
@@ -0,0 +1,76 @@
import inspect
import json
from pathlib import Path

import pytest

from spacy import util
from spacy.language import Language
from spacy.util import registry

# Path to the reference factory registrations, relative to this file
REFERENCE_FILE = Path(__file__).parent / "factory_registrations.json"

# Monkey-patch util.is_same_func to handle Cython functions
original_is_same_func = util.is_same_func


def patched_is_same_func(func1, func2):
    try:
        return original_is_same_func(func1, func2)
    except TypeError:
        # For Cython functions, just compare the string representations
        return str(func1) == str(func2)


util.is_same_func = patched_is_same_func


@pytest.fixture
def reference_factory_registrations():
    """Load reference factory registrations from JSON file"""
    if not REFERENCE_FILE.exists():
        pytest.fail(
            f"Reference file {REFERENCE_FILE} not found. "
            f"Run export_factory_registrations.py first."
        )
    with REFERENCE_FILE.open("r") as f:
        return json.load(f)


def test_factory_registrations_preserved(reference_factory_registrations):
    """Test that all factory registrations from the reference file are still present."""
    # Ensure the registry is populated
    registry.ensure_populated()

    # Get all current factory registrations
    all_factories = registry.factories.get_all()

    # Record name, module and function for each factory
    current_registrations = {}
    for name, func in all_factories.items():
        try:
            module_name = func.__module__
        except (AttributeError, TypeError):
            # For Cython functions, fall back to the string representation
            module_name = str(func).split()[1].split(".")[0]
        try:
            func_name = func.__qualname__
        except (AttributeError, TypeError):
            # For Cython functions, use the function's name if available
            func_name = (
                func.__name__
                if hasattr(func, "__name__")
                else str(func).split()[1].split(".")[-1]
            )
        current_registrations[name] = {
            "name": name,
            "module": module_name,
            "function": func_name,
        }

    # Check for missing registrations
    missing_registrations = set(reference_factory_registrations) - set(current_registrations)
    assert not missing_registrations, (
        f"Missing factory registrations: {', '.join(sorted(missing_registrations))}"
    )

    # Check for new registrations (not an error, but informative)
    new_registrations = set(current_registrations) - set(reference_factory_registrations)
    if new_registrations:
        print(f"New factory registrations found: {', '.join(sorted(new_registrations))}")
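The monkey-patch above exists because Cython-compiled callables can raise when introspected. A hypothetical standalone version of the same fallback, purely to make the intent explicit (funcs_match is an illustrative name, not spaCy API):

# Hypothetical helper: identity check that degrades gracefully for callables
# lacking __module__/__qualname__ (e.g. Cython-compiled functions).
def funcs_match(func1, func2) -> bool:
    try:
        return (func1.__module__, func1.__qualname__) == (
            func2.__module__,
            func2.__qualname__,
        )
    except (AttributeError, TypeError):
        # Fall back to the printable form, as the patched test helper does
        return str(func1) == str(func2)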
48 spacy/tests/test_registry_population.py Normal file
@@ -0,0 +1,48 @@
import json
import os
from pathlib import Path

import pytest

from spacy.util import registry

# Path to the reference registry contents, relative to this file
REFERENCE_FILE = Path(__file__).parent / "registry_contents.json"


@pytest.fixture
def reference_registry():
    """Load reference registry contents from JSON file"""
    if not REFERENCE_FILE.exists():
        pytest.fail(f"Reference file {REFERENCE_FILE} not found.")
    with REFERENCE_FILE.open("r") as f:
        return json.load(f)


def test_registry_types(reference_registry):
    """Test that all registry types match the reference"""
    # Get current registry types
    current_registry_types = set(registry.get_registry_names())
    expected_registry_types = set(reference_registry.keys())

    # Check for missing registry types
    missing_types = expected_registry_types - current_registry_types
    assert not missing_types, f"Missing registry types: {', '.join(missing_types)}"


def test_registry_entries(reference_registry):
    """Test that all registry entries are present"""
    # Check each registry's entries
    for registry_name, expected_entries in reference_registry.items():
        # Fail if this registry type doesn't exist anymore
        if not hasattr(registry, registry_name):
            pytest.fail(f"Registry '{registry_name}' does not exist.")

        # Get the current entries
        reg = getattr(registry, registry_name)
        current_entries = sorted(reg.get_all().keys())

        # Missing entries would indicate that the new registry population
        # mechanism is missing something
        missing_entries = set(expected_entries) - set(current_entries)
        assert not missing_entries, (
            f"Registry '{registry_name}' missing entries: "
            f"{', '.join(missing_entries)}"
        )
@@ -132,9 +132,17 @@ class registry(thinc.registry):
     models = catalogue.create("spacy", "models", entry_points=True)
     cli = catalogue.create("spacy", "cli", entry_points=True)
 
+    @classmethod
+    def ensure_populated(cls) -> None:
+        """Ensure the registry is populated with all necessary components."""
+        from .registrations import populate_registry, REGISTRY_POPULATED
+        if not REGISTRY_POPULATED:
+            populate_registry()
+
     @classmethod
     def get_registry_names(cls) -> List[str]:
         """List all available registries."""
+        cls.ensure_populated()
         names = []
         for name, value in inspect.getmembers(cls):
             if not name.startswith("_") and isinstance(value, Registry):
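ensure_populated defers to the new spacy/registrations.py module, which this view doesn't show in full. Judging only by the names imported here (populate_registry, REGISTRY_POPULATED), its shape is presumably a module-level flag plus an idempotent populate function; a sketch with one illustrative entry, not the real file:

# Sketch of spacy/registrations.py: decorator-free, one-shot registration.
REGISTRY_POPULATED = False


def populate_registry() -> None:
    """Register all built-in functions exactly once, on first use."""
    global REGISTRY_POPULATED
    if REGISTRY_POPULATED:
        return
    # Imports stay inside the function so `import spacy` remains cheap
    from .util import make_first_longest_spans_filter, registry

    # Decorator-free equivalent of the @registry.misc(...) line removed
    # further down in this diff; the real module registers many more entries
    registry.misc("spacy.first_longest_spans_filter.v1")(make_first_longest_spans_filter)
    REGISTRY_POPULATED = True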
@@ -144,6 +152,7 @@ class registry(thinc.registry):
     @classmethod
     def get(cls, registry_name: str, func_name: str) -> Callable:
         """Get a registered function from the registry."""
+        cls.ensure_populated()
         # We're overwriting this classmethod so we're able to provide more
         # specific error messages and implement a fallback to spacy-legacy.
         if not hasattr(cls, registry_name):
@@ -179,6 +188,7 @@ class registry(thinc.registry):
         func_name (str): Name of the registered function.
         RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
         """
+        cls.ensure_populated()
         # We're overwriting this classmethod so we're able to provide more
         # specific error messages and implement a fallback to spacy-legacy.
         if not hasattr(cls, registry_name):
@@ -205,6 +215,7 @@ class registry(thinc.registry):
     @classmethod
     def has(cls, registry_name: str, func_name: str) -> bool:
         """Check whether a function is available in a registry."""
+        cls.ensure_populated()
         if not hasattr(cls, registry_name):
             return False
         reg = getattr(cls, registry_name)
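With the hook in get, find, and has, population happens lazily on the first lookup rather than at import time. A quick usage sketch (the entry name is taken from the diff below; any of the snapshot's entries would do):

# First lookup triggers populate_registry(); later calls see the
# already-populated catalogue.
from spacy.util import registry

assert registry.has("misc", "spacy.first_longest_spans_filter.v1")
spans_filter = registry.get("misc", "spacy.first_longest_spans_filter.v1")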
@@ -1323,7 +1334,6 @@ def filter_chain_spans(*spans: Iterable["Span"]) -> List["Span"]:
     return filter_spans(itertools.chain(*spans))
 
 
-@registry.misc("spacy.first_longest_spans_filter.v1")
 def make_first_longest_spans_filter():
     return filter_chain_spans
 