mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
default models defined in component decorator (#5452)
* move defaults to pipeline and use in component decorator
* black formatting
* relative import
This commit is contained in:
parent
0d94737857
commit
f00de445dd
|
@ -184,33 +184,6 @@ class Language(object):
|
|||
self.max_length = max_length
|
||||
self._optimizer = None
|
||||
|
||||
# TODO: de-uglify (incorporating into component decorator didn't work because of circular imports)
|
||||
from .ml.models.defaults import (
|
||||
default_tagger_config,
|
||||
default_parser_config,
|
||||
default_ner_config,
|
||||
default_textcat_config,
|
||||
default_nel_config,
|
||||
default_morphologizer_config,
|
||||
default_senter_config,
|
||||
default_tensorizer_config,
|
||||
default_tok2vec_config,
|
||||
default_simple_ner_config
|
||||
)
|
||||
|
||||
self.defaults = {
|
||||
"tagger": default_tagger_config(),
|
||||
"parser": default_parser_config(),
|
||||
"ner": default_ner_config(),
|
||||
"textcat": default_textcat_config(),
|
||||
"entity_linker": default_nel_config(),
|
||||
"morphologizer": default_morphologizer_config(),
|
||||
"senter": default_senter_config(),
|
||||
"simple_ner": default_simple_ner_config(),
|
||||
"tensorizer": default_tensorizer_config(),
|
||||
"tok2vec": default_tok2vec_config(),
|
||||
}
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return self._path
|
||||
|
@ -338,7 +311,6 @@ class Language(object):
|
|||
else:
|
||||
raise KeyError(Errors.E002.format(name=name))
|
||||
factory = self.factories[name]
|
||||
default_config = self.defaults.get(name, None)
|
||||
|
||||
# transform the model's config to an actual Model
|
||||
factory_cfg = dict(config)
|
||||
|
@ -349,11 +321,6 @@ class Language(object):
|
|||
warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
|
||||
model_cfg = None
|
||||
del factory_cfg["model"]
|
||||
if model_cfg is None and default_config is not None:
|
||||
warnings.warn(Warnings.W098.format(name=name))
|
||||
model_cfg = default_config["model"]
|
||||
if model_cfg is None:
|
||||
warnings.warn(Warnings.W097.format(name=name))
|
||||
model = None
|
||||
if model_cfg is not None:
|
||||
self.config[name] = {"model": model_cfg}
|
||||
|
@ -539,7 +506,11 @@ class Language(object):
|
|||
to_disable = [pipe for pipe in self.pipe_names if pipe not in enable]
|
||||
# raise an error if the enable and disable keywords are not consistent
|
||||
if disable is not None and disable != to_disable:
|
||||
raise ValueError(Errors.E992.format(enable=enable, disable=disable, names=self.pipe_names))
|
||||
raise ValueError(
|
||||
Errors.E992.format(
|
||||
enable=enable, disable=disable, names=self.pipe_names
|
||||
)
|
||||
)
|
||||
disable = to_disable
|
||||
return DisabledPipes(self, disable)
|
||||
|
||||
|
@ -1085,7 +1056,14 @@ class component(object):
|
|||
# NB: This decorator needs to live here, because it needs to write to
|
||||
# Language.factories. All other solutions would cause circular import.
|
||||
|
||||
def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
|
||||
def __init__(
|
||||
self,
|
||||
name=None,
|
||||
assigns=tuple(),
|
||||
requires=tuple(),
|
||||
retokenizes=False,
|
||||
default_model=lambda: None,
|
||||
):
|
||||
"""Decorate a pipeline component.
|
||||
|
||||
name (unicode): Default component and factory name.
|
||||
|
@ -1097,6 +1075,7 @@ class component(object):
|
|||
self.assigns = validate_attrs(assigns)
|
||||
self.requires = validate_attrs(requires)
|
||||
self.retokenizes = retokenizes
|
||||
self.default_model = default_model
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
obj = args[0]
|
||||
|
@ -1109,6 +1088,11 @@ class component(object):
|
|||
obj.retokenizes = self.retokenizes
|
||||
|
||||
def factory(nlp, model, **cfg):
|
||||
if model is None:
|
||||
model = self.default_model()
|
||||
warnings.warn(Warnings.W098.format(name=self.name))
|
||||
if model is None:
|
||||
warnings.warn(Warnings.W097.format(name=self.name))
|
||||
if hasattr(obj, "from_nlp"):
|
||||
return obj.from_nlp(nlp, model, **cfg)
|
||||
elif isinstance(obj, type):
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from pathlib import Path
|
||||
|
||||
from .... import util
|
||||
from ... import util
|
||||
|
||||
|
||||
def default_nel_config():
|
|
@ -17,9 +17,10 @@ from ..util import link_vectors_to_models, create_default_optimizer
|
|||
from ..errors import Errors, TempErrors
|
||||
from .pipes import Tagger, _load_cfg
|
||||
from .. import util
|
||||
from .defaults import default_morphologizer
|
||||
|
||||
|
||||
@component("morphologizer", assigns=["token.morph", "token.pos"])
|
||||
@component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer)
|
||||
class Morphologizer(Tagger):
|
||||
|
||||
def __init__(self, vocab, model, **cfg):
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
import numpy
|
||||
import srsly
|
||||
import random
|
||||
|
||||
from thinc.api import CosineDistance, to_categorical, get_array_module
|
||||
from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
|
||||
import warnings
|
||||
|
@ -13,6 +14,8 @@ from ..syntax.arc_eager cimport ArcEager
|
|||
from ..morphology cimport Morphology
|
||||
from ..vocab cimport Vocab
|
||||
|
||||
from .defaults import default_tagger, default_parser, default_ner, default_textcat
|
||||
from .defaults import default_nel, default_senter, default_tensorizer
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language, component
|
||||
from ..syntax import nonproj
|
||||
|
@ -234,7 +237,7 @@ class Pipe(object):
|
|||
return self
|
||||
|
||||
|
||||
@component("tensorizer", assigns=["doc.tensor"])
|
||||
@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
|
||||
class Tensorizer(Pipe):
|
||||
"""Pre-train position-sensitive vectors for tokens."""
|
||||
|
||||
|
@ -366,7 +369,7 @@ class Tensorizer(Pipe):
|
|||
return sgd
|
||||
|
||||
|
||||
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"])
|
||||
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
|
||||
class Tagger(Pipe):
|
||||
"""Pipeline component for part-of-speech tagging.
|
||||
|
||||
|
@ -636,7 +639,7 @@ class Tagger(Pipe):
|
|||
return self
|
||||
|
||||
|
||||
@component("senter", assigns=["token.is_sent_start"])
|
||||
@component("senter", assigns=["token.is_sent_start"], default_model=default_senter)
|
||||
class SentenceRecognizer(Tagger):
|
||||
"""Pipeline component for sentence segmentation.
|
||||
|
||||
|
@ -976,7 +979,7 @@ class ClozeMultitask(Pipe):
|
|||
losses[self.name] += loss
|
||||
|
||||
|
||||
@component("textcat", assigns=["doc.cats"])
|
||||
@component("textcat", assigns=["doc.cats"], default_model=default_textcat)
|
||||
class TextCategorizer(Pipe):
|
||||
"""Pipeline component for text classification.
|
||||
|
||||
|
@ -1227,7 +1230,8 @@ cdef class EntityRecognizer(Parser):
|
|||
@component(
|
||||
"entity_linker",
|
||||
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
|
||||
assigns=["token.ent_kb_id"]
|
||||
assigns=["token.ent_kb_id"],
|
||||
default_model=default_nel,
|
||||
)
|
||||
class EntityLinker(Pipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
@ -1673,8 +1677,19 @@ class Sentencizer(Pipe):
|
|||
|
||||
|
||||
# Cython classes can't be decorated, so we need to add the factories here
|
||||
Language.factories["parser"] = lambda nlp, model, **cfg: DependencyParser.from_nlp(nlp, model, **cfg)
|
||||
Language.factories["ner"] = lambda nlp, model, **cfg: EntityRecognizer.from_nlp(nlp, model, **cfg)
|
||||
Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, model, **cfg)
|
||||
Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg)
|
||||
|
||||
def parser_factory(nlp, model, **cfg):
    """Factory registered for the "parser" pipeline component.

    nlp: Passed through to `DependencyParser.from_nlp`.
    model: The model to use. If None, a model is created with
        `default_parser()` and warning W098 is emitted.
    **cfg: Extra settings forwarded to `DependencyParser.from_nlp`.
    RETURNS: Whatever `DependencyParser.from_nlp` returns.
    """
    # Fast path: the caller supplied a model, so no fallback is needed.
    if model is not None:
        return DependencyParser.from_nlp(nlp, model, **cfg)
    # Fallback: build the default model first, then warn about its use.
    model = default_parser()
    warnings.warn(Warnings.W098.format(name="parser"))
    return DependencyParser.from_nlp(nlp, model, **cfg)
|
||||
|
||||
def ner_factory(nlp, model, **cfg):
    """Factory registered for the "ner" pipeline component.

    nlp: Passed through to `EntityRecognizer.from_nlp`.
    model: The model to use. If None, a model is created with
        `default_ner()` and warning W098 is emitted.
    **cfg: Extra settings forwarded to `EntityRecognizer.from_nlp`.
    RETURNS: Whatever `EntityRecognizer.from_nlp` returns.
    """
    # Fast path: the caller supplied a model, so no fallback is needed.
    if model is not None:
        return EntityRecognizer.from_nlp(nlp, model, **cfg)
    # Fallback: build the default model first, then warn about its use.
    model = default_ner()
    warnings.warn(Warnings.W098.format(name="ner"))
    return EntityRecognizer.from_nlp(nlp, model, **cfg)
|
||||
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||
|
|
|
@ -2,6 +2,8 @@ from typing import List
|
|||
from thinc.types import Floats2d
|
||||
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate
|
||||
from thinc.util import to_numpy
|
||||
|
||||
from .defaults import default_simple_ner
|
||||
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
|
||||
from ..tokens import Doc
|
||||
from ..language import component
|
||||
|
@ -9,7 +11,7 @@ from ..util import link_vectors_to_models
|
|||
from .pipes import Pipe
|
||||
|
||||
|
||||
@component("simple_ner", assigns=["doc.ents"])
|
||||
@component("simple_ner", assigns=["doc.ents"], default_model=default_simple_ner)
|
||||
class SimpleNER(Pipe):
|
||||
"""Named entity recognition with a tagging model. The model should include
|
||||
validity constraints to ensure that only valid tag sequences are returned."""
|
||||
|
|
|
@ -6,9 +6,10 @@ from ..tokens import Doc
|
|||
from ..vocab import Vocab
|
||||
from ..language import component
|
||||
from ..util import link_vectors_to_models, minibatch, eg2doc
|
||||
from .defaults import default_tok2vec
|
||||
|
||||
|
||||
@component("tok2vec", assigns=["doc.tensor"])
|
||||
@component("tok2vec", assigns=["doc.tensor"], default_model=default_tok2vec)
|
||||
class Tok2Vec(Pipe):
|
||||
@classmethod
|
||||
def from_nlp(cls, nlp, model, **cfg):
|
||||
|
|
|
@ -3,7 +3,7 @@ from spacy.tokens import Span
|
|||
import pytest
|
||||
|
||||
from ..util import get_doc
|
||||
from ...ml.models.defaults import default_ner
|
||||
from spacy.pipeline.defaults import default_ner
|
||||
|
||||
|
||||
def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||
|
|
|
@ -4,7 +4,7 @@ from spacy.attrs import NORM
|
|||
from spacy.gold import GoldParse
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from spacy.ml.models.defaults import default_parser, default_ner
|
||||
from spacy.pipeline.defaults import default_parser, default_ner
|
||||
from spacy.tokens import Doc
|
||||
from spacy.pipeline import DependencyParser, EntityRecognizer
|
||||
from spacy.util import fix_random_seed
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from spacy.ml.models.defaults import default_parser
|
||||
from spacy.pipeline.defaults import default_parser
|
||||
from spacy.pipeline import DependencyParser
|
||||
from spacy.tokens import Doc
|
||||
from spacy.gold import GoldParse
|
||||
|
|
|
@ -2,7 +2,7 @@ import pytest
|
|||
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml.models.defaults import default_ner
|
||||
from spacy.pipeline.defaults import default_ner
|
||||
|
||||
from spacy.pipeline import EntityRecognizer, EntityRuler
|
||||
from spacy.vocab import Vocab
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
from spacy.ml.models.defaults import default_parser, default_tok2vec
|
||||
from spacy.pipeline.defaults import default_parser, default_tok2vec
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.syntax.arc_eager import ArcEager
|
||||
from spacy.syntax.nn_parser import Parser
|
||||
|
|
|
@ -2,7 +2,7 @@ import pytest
|
|||
import numpy
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.language import Language
|
||||
from spacy.ml.models.defaults import default_parser
|
||||
from spacy.pipeline.defaults import default_parser
|
||||
from spacy.pipeline import DependencyParser
|
||||
from spacy.syntax.arc_eager import ArcEager
|
||||
from spacy.tokens import Doc
|
||||
|
|
|
@ -4,7 +4,7 @@ from spacy.attrs import NORM
|
|||
from spacy.gold import GoldParse
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from spacy.ml.models.defaults import default_parser
|
||||
from spacy.pipeline.defaults import default_parser
|
||||
from spacy.tokens import Doc
|
||||
from spacy.pipeline import DependencyParser
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ from spacy.gold import GoldParse
|
|||
from spacy.util import fix_random_seed
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ...ml.models.defaults import default_tok2vec
|
||||
from spacy.pipeline.defaults import default_tok2vec
|
||||
|
||||
TRAIN_DATA = [
|
||||
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
|
||||
|
|
|
@ -10,7 +10,7 @@ from spacy.lang.lex_attrs import is_stop
|
|||
from spacy.vectors import Vectors
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.language import Language
|
||||
from spacy.ml.models.defaults import default_ner, default_tagger
|
||||
from spacy.pipeline.defaults import default_ner, default_tagger
|
||||
from spacy.tokens import Doc, Span, Token
|
||||
from spacy.pipeline import Tagger, EntityRecognizer
|
||||
from spacy.attrs import HEAD, DEP
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.de import German
|
||||
from spacy.ml.models.defaults import default_ner
|
||||
from spacy.pipeline.defaults import default_ner
|
||||
from spacy.pipeline import EntityRuler, EntityRecognizer
|
||||
from spacy.matcher import Matcher, PhraseMatcher
|
||||
from spacy.tokens import Doc
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from spacy.pipeline.pipes import DependencyParser
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from spacy.ml.models.defaults import default_parser
|
||||
from spacy.pipeline.defaults import default_parser
|
||||
|
||||
|
||||
def test_issue3830_no_subtok():
|
||||
|
|
|
@ -3,7 +3,7 @@ from spacy.pipeline import EntityRecognizer, EntityRuler
|
|||
from spacy.lang.en import English
|
||||
from spacy.tokens import Span
|
||||
from spacy.util import ensure_path
|
||||
from spacy.ml.models.defaults import default_ner
|
||||
from spacy.pipeline.defaults import default_ner
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from collections import defaultdict
|
||||
|
||||
from spacy.ml.models.defaults import default_ner
|
||||
from spacy.pipeline.defaults import default_ner
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
|
||||
from spacy.lang.en import English
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import pytest
|
||||
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
|
||||
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
|
||||
from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
|
||||
from spacy.ml.models.defaults import default_textcat, default_senter
|
||||
from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
|
||||
from spacy.pipeline.defaults import default_textcat, default_senter
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user