Move registrations

This commit is contained in:
Matthew Honnibal 2025-05-21 23:19:00 +02:00
parent ab5f1c1013
commit c8d7dd968a
13 changed files with 22 additions and 15 deletions

View File

@ -32,7 +32,6 @@ split_mode = null
"""
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
def create_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)

View File

@ -20,7 +20,6 @@ DEFAULT_CONFIG = """
"""
@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp.vocab)

View File

@ -13,7 +13,6 @@ DEFAULT_CONFIG = """
"""
@registry.tokenizers("spacy.th.ThaiTokenizer")
def create_thai_tokenizer():
def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp.vocab)

View File

@ -22,7 +22,6 @@ use_pyvi = true
"""
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)

View File

@ -46,7 +46,6 @@ class Segmenter(str, Enum):
return list(cls.__members__.keys())
@registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
def chinese_tokenizer_factory(nlp):
return ChineseTokenizer(nlp.vocab, segmenter=segmenter)

View File

@ -104,7 +104,6 @@ class BaseDefaults:
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
"""Registered function to create a tokenizer. Returns a factory that takes
the nlp object and returns a Tokenizer instance using the language defaults.
@ -130,7 +129,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables)

View File

@ -30,7 +30,6 @@ if TYPE_CHECKING:
from ...vocab import Vocab # noqa: F401
@registry.architectures("spacy.PretrainVectors.v1")
def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
@ -57,7 +56,6 @@ def create_pretrain_vectors(
return create_vectors_objective
@registry.architectures("spacy.PretrainCharacters.v1")
def create_pretrain_characters(
maxout_pieces: int, hidden_size: int, n_characters: int
) -> Callable[["Vocab", Model], Model]:

View File

@ -11,7 +11,6 @@ from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
@registry.architectures("spacy.TransitionBasedParser.v2")
def build_tb_parser_model(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],

View File

@ -7,7 +7,6 @@ from ...tokens import Doc
from ...util import registry
@registry.architectures("spacy.Tagger.v2")
def build_tagger_model(
tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False
) -> Model[List[Doc], List[Floats2d]]:

View File

@ -4,7 +4,6 @@ from ..util import registry
from .parser_model import ParserStepModel
@registry.layers("spacy.TransitionModel.v1")
def TransitionModel(
tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
):

View File

@ -66,7 +66,6 @@ def parser_score(examples, **kwargs):
return results
@registry.scorers("spacy.parser_scorer.v1")
def make_parser_scorer():
return parser_score

View File

@ -64,7 +64,6 @@ def morphologizer_score(examples, **kwargs):
return results
@registry.scorers("spacy.morphologizer_scorer.v1")
def make_morphologizer_scorer():
return morphologizer_score

View File

@ -43,6 +43,13 @@ def populate_registry() -> None:
make_preserve_existing_ents_filter,
)
from .pipeline.attributeruler import make_attribute_ruler_scorer
from .pipeline.dep_parser import make_parser_scorer
from .pipeline.morphologizer import make_morphologizer_scorer
from .lang.ja import create_tokenizer as create_japanese_tokenizer
from .lang.zh import create_chinese_tokenizer
from .lang.ko import create_tokenizer as create_korean_tokenizer
from .lang.vi import create_vietnamese_tokenizer
from .lang.th import create_thai_tokenizer
# Import all pipeline components that were using registry decorators
from .pipeline.tagger import make_tagger_scorer
@ -65,6 +72,7 @@ def populate_registry() -> None:
registry.misc("spacy.EmptyKB.v1")(empty_kb)
registry.misc("spacy.CandidateGenerator.v1")(create_candidates)
registry.misc("spacy.CandidateBatchGenerator.v1")(create_candidates_batch)
registry.misc("spacy.LookupsDataLoader.v1")(load_lookups_data)
# Import the functions that were previously registered via decorators, so that
# we hold references to them and can register them explicitly below
@ -109,11 +117,13 @@ def populate_registry() -> None:
)
from .ml.models.span_finder import build_finder_model
from .ml.models.parser import build_tb_parser_model
from .ml.models.multi_task import create_pretrain_vectors
from .ml.models.multi_task import create_pretrain_vectors, create_pretrain_characters
from .ml.models.tagger import build_tagger_model
from .ml.staticvectors import StaticVectors
from .ml._precomputable_affine import PrecomputableAffine
from .ml._character_embed import CharacterEmbed
from .ml.tb_framework import TransitionModel
from .language import create_tokenizer, load_lookups_data
from .matcher.levenshtein import make_levenshtein_compare
from .training.callbacks import create_copy_from_base_model
from .ml.callbacks import create_models_with_nvtx_range, create_models_and_pipes_with_nvtx_range
@ -150,6 +160,15 @@ def populate_registry() -> None:
registry.scorers("spacy.entity_linker_scorer.v1")(make_entity_linker_scorer)
registry.scorers("spacy.overlapping_labeled_spans_scorer.v1")(make_overlapping_labeled_spans_scorer)
registry.scorers("spacy.attribute_ruler_scorer.v1")(make_attribute_ruler_scorer)
registry.scorers("spacy.parser_scorer.v1")(make_parser_scorer)
registry.scorers("spacy.morphologizer_scorer.v1")(make_morphologizer_scorer)
# Register tokenizers
registry.tokenizers("spacy.ja.JapaneseTokenizer")(create_japanese_tokenizer)
registry.tokenizers("spacy.zh.ChineseTokenizer")(create_chinese_tokenizer)
registry.tokenizers("spacy.ko.KoreanTokenizer")(create_korean_tokenizer)
registry.tokenizers("spacy.vi.VietnameseTokenizer")(create_vietnamese_tokenizer)
registry.tokenizers("spacy.th.ThaiTokenizer")(create_thai_tokenizer)
# Register tok2vec architectures we've modified
registry.architectures("spacy.Tok2VecListener.v1")(tok2vec_listener_v1)
@ -172,6 +191,7 @@ def populate_registry() -> None:
registry.architectures("spacy.SpanFinder.v1")(build_finder_model)
registry.architectures("spacy.TransitionBasedParser.v2")(build_tb_parser_model)
registry.architectures("spacy.PretrainVectors.v1")(create_pretrain_vectors)
registry.architectures("spacy.PretrainCharacters.v1")(create_pretrain_characters)
registry.architectures("spacy.Tagger.v2")(build_tagger_model)
# Register layers
@ -183,6 +203,7 @@ def populate_registry() -> None:
registry.layers("spacy.StaticVectors.v2")(StaticVectors)
registry.layers("spacy.PrecomputableAffine.v1")(PrecomputableAffine)
registry.layers("spacy.CharEmbed.v1")(CharacterEmbed)
registry.layers("spacy.TransitionModel.v1")(TransitionModel)
# Register callbacks
registry.callbacks("spacy.copy_from_base_model.v1")(create_copy_from_base_model)