Component decorator and component analysis (#4517)

* Add work in progress

* Update analysis helpers and component decorator

* Fix porting of docstrings for Python 2

* Fix docstring stuff on Python 2

* Support meta factories when loading model

* Put auto pipeline analysis behind flag for now

* Analyse pipes on remove_pipe and replace_pipe

* Move analysis to root for now

Try to find a better place for it, but it needs to live at the root for now to avoid circular imports

* Simplify decorator

Don't return a wrapped class and instead just write to the object

* Update existing components and factories

* Add condition in factory for classes vs. functions

* Add missing from_nlp classmethods

* Add "retokenizes" to printed overview

* Update assigns/requires declarations of builtins

* Only return data if no_print is enabled

* Use multiline table for overview

* Don't support Span

* Rewrite errors/warnings and move them to spacy.errors
This commit is contained in:
Ines Montani 2019-10-27 13:35:49 +01:00 committed by Matthew Honnibal
parent 1180304449
commit a9c6104047
15 changed files with 492 additions and 53 deletions

View File

@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
thinc>=7.2.0,<7.3.0
blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.2.0,<1.1.0
wasabi>=0.3.0,<1.1.0
srsly>=0.1.0,<1.1.0
# Third party dependencies
numpy>=1.15.0

View File

@ -49,7 +49,7 @@ install_requires =
blis>=0.4.0,<0.5.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
wasabi>=0.2.0,<1.1.0
wasabi>=0.3.0,<1.1.0
srsly>=0.1.0,<1.1.0
pathlib==1.0.1; python_version < "3.4"
importlib_metadata>=0.20; python_version < "3.8"

View File

@ -9,12 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# These are imported as part of the API
from thinc.neural.util import prefer_gpu, require_gpu
from . import pipeline
from .cli.info import info as cli_info
from .glossary import explain
from .about import __version__
from .errors import Errors, Warnings, deprecation_warning
from . import util
from .util import register_architecture, get_architecture
from .language import component
if sys.maxunicode == 65535:

176
spacy/analysis.py Normal file
View File

@ -0,0 +1,176 @@
# coding: utf8
from __future__ import unicode_literals
from collections import OrderedDict
from wasabi import Printer
from .tokens import Doc, Token
from .errors import Errors, Warnings, user_warning
def analyze_pipes(pipeline, name, pipe, index, warn=True):
    """Check one pipeline component against the components that run before it.
    Every attribute the component declares in "requires" has to be declared
    in "assigns" by at least one earlier component, otherwise it is reported
    as a problem.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    name (unicode): The name of the pipeline component to analyze.
    pipe (callable): The pipeline component function to analyze.
    index (int): The index of the component in the pipeline.
    warn (bool): Show user warning if problem is found.
    RETURNS (list): The problems found for the given pipeline component.
    """
    assert pipeline[index][0] == name
    # Ordered so problems come back in declaration order, without duplicates
    status = OrderedDict((attr, False) for attr in getattr(pipe, "requires", []))
    if status:
        for _, earlier_pipe in pipeline[:index]:
            for attr in getattr(earlier_pipe, "assigns", []):
                status[attr] = True
    problems = []
    for attr, is_assigned in status.items():
        if is_assigned:
            continue
        problems.append(attr)
        if warn:
            user_warning(Warnings.W025.format(name=name, attr=attr))
    return problems
def analyze_all_pipes(pipeline, warn=True):
    """Run the requirements analysis for every component, in pipeline order.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    warn (bool): Show user warning if problem is found.
    RETURNS (dict): The problems found, keyed by component name.
    """
    return {
        name: analyze_pipes(pipeline, name, pipe, position, warn=warn)
        for position, (name, pipe) in enumerate(pipeline)
    }
def dot_to_dict(values):
    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
    become {"token": {"pos": True, "_": {"xyz": True }}}. Values are
    lower-cased before they are split.

    If one value is a prefix of another (e.g. "token" and "token.pos"), the
    nested form wins regardless of the order in which they are given.

    values (iterable): The values to convert.
    RETURNS (dict): The converted values.
    """
    result = {}
    for value in values:
        path = result
        parts = value.lower().split(".")
        for item in parts[:-1]:
            child = path.get(item)
            if not isinstance(child, dict):
                # Either missing, or stored as a True leaf by a shorter value
                # seen earlier: turn it into a branch so we can descend.
                child = path[item] = {}
            path = child
        # Never overwrite an existing branch with a leaf
        path.setdefault(parts[-1], True)
    return result
def validate_attrs(values):
    """Validate component attributes provided to "assigns", "requires" etc.
    Raises error for invalid attributes and formatting. Doesn't check if
    custom extension attributes are registered, since this is something the
    user might want to do themselves later in the component.

    Every declaration is validated on its own, so an incomplete entry such as
    "doc" is reported even if a longer entry like "doc.ents" is declared too.

    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
    RETURNS (iterable): The checked attributes.
    RAISES (ValueError): If any of the declarations is invalid.
    """
    objs = {"doc": Doc, "token": Token}
    for declared in values:
        # Converting one value at a time keeps overlapping declarations from
        # hiding each other (or clashing) in the merged dict.
        data = dot_to_dict([declared])
        for obj_key, attrs in data.items():
            if obj_key not in objs:  # first element is not doc/token
                # obj_key is lower-cased, so compare against lower-cased input
                if obj_key == "span":
                    span_attrs = [
                        attr for attr in values if attr.lower().startswith("span.")
                    ]
                    raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
                invalid_attrs = ", ".join(
                    a for a in values if a.lower().startswith(obj_key)
                )
                raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
            if not isinstance(attrs, dict):  # attr is something like "doc"
                raise ValueError(Errors.E182.format(attr=obj_key))
            for attr, value in attrs.items():
                if attr == "_":
                    if value is True:  # attr is something like "doc._"
                        raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
                    for ext_attr, ext_value in value.items():
                        # We don't check whether the attribute actually exists
                        if ext_value is not True:  # attr is something like doc._.x.y
                            good = "{}._.{}".format(obj_key, ext_attr)
                            bad = "{}.{}".format(good, ".".join(ext_value))
                            raise ValueError(Errors.E183.format(attr=bad, solution=good))
                    continue  # we can't validate those further
                if attr.endswith("_"):  # attr is something like "token.pos_"
                    raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
                if value is not True:  # attr is something like doc.x.y
                    good = "{}.{}".format(obj_key, attr)
                    bad = "{}.{}".format(good, ".".join(value))
                    raise ValueError(Errors.E183.format(attr=bad, solution=good))
                obj = objs[obj_key]
                if not hasattr(obj, attr):
                    raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
    return values
def _get_feature_for_attr(pipeline, attr, feature):
assert feature in ["assigns", "requires"]
result = []
for pipe_name, pipe in pipeline:
pipe_assigns = getattr(pipe, feature, [])
if attr in pipe_assigns:
result.append((pipe_name, pipe))
return result
def get_assigns_for_attr(pipeline, attr):
    """Get all pipeline components that assign an attr, e.g. "doc.tensor".
    Only the components' "assigns" declarations are consulted.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    RETURNS (list): (name, pipe) tuples of components that assign the attr.
    """
    return _get_feature_for_attr(pipeline, attr, "assigns")
def get_requires_for_attr(pipeline, attr):
    """Get all pipeline components that require an attr, e.g. "doc.tensor".
    Only the components' "requires" declarations are consulted.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    RETURNS (list): (name, pipe) tuples of components that require the attr.
    """
    return _get_feature_for_attr(pipeline, attr, "requires")
def print_summary(nlp, pretty=True, no_print=False):
    """Print a formatted summary for the current nlp object's pipeline. Shows
    a table with the pipeline components and what they assign and require, as
    well as any problems if available.

    nlp (Language): The nlp object.
    pretty (bool): Pretty-print the results (color etc).
    no_print (bool): Don't print anything, just return the data.
    RETURNS (dict): A dict with "overview" and "problems". Only returned if
        no_print is enabled, otherwise the function returns None.
    """
    msg = Printer(pretty=pretty, no_print=no_print)
    overview = []
    problems = {}
    for position, (name, pipe) in enumerate(nlp.pipeline):
        row = (
            position,
            name,
            getattr(pipe, "requires", []),
            getattr(pipe, "assigns", []),
            getattr(pipe, "retokenizes", False),
        )
        overview.append(row)
        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, position, warn=False)
    msg.divider("Pipeline Overview")
    msg.table(
        overview,
        header=("#", "Component", "Requires", "Assigns", "Retokenizes"),
        divider=True,
        multiline=True,
    )
    n_problems = sum(len(found) for found in problems.values())
    if n_problems:
        msg.divider("Problems ({})".format(n_problems))
        for name, found in problems.items():
            if found:
                msg.warn("'{}' requirements not met: {}".format(name, ", ".join(found)))
    else:
        msg.good("No problems found.")
    if no_print:
        return {"overview": overview, "problems": problems}

View File

@ -12,6 +12,7 @@ import os
import sys
import itertools
import ast
import types
from thinc.neural.util import copy_array
@ -67,6 +68,7 @@ if is_python2:
basestring_ = basestring # noqa: F821
input_ = raw_input # noqa: F821
path2str = lambda path: str(path).decode("utf8")
class_types = (type, types.ClassType)
elif is_python3:
bytes_ = bytes
@ -74,6 +76,7 @@ elif is_python3:
basestring_ = str
input_ = input
path2str = lambda path: str(path)
class_types = (type, types.ClassType) if is_python_pre_3_5 else type
def b_to_str(b_str):

View File

@ -99,6 +99,8 @@ class Warnings(object):
"'n_process' will be set to 1.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
"previous components in the pipeline declare that they assign it.")
@add_codes
@ -511,6 +513,20 @@ class Errors(object):
E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
"Doc. If you only want to add one pattern, make sure to wrap it "
"in a list. For example: matcher.add('{key}', [doc])")
E180 = ("Span attributes can't be declared as required or assigned by "
"components, since spans are only views of the Doc. Use Doc and "
"Token attributes only and remove the following: {attrs}")
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
"Only Doc and Token attributes are supported.")
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
"to define the attribute? For example: {attr}.???")
E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
"attributes are supported, for example: {solution}")
E184 = ("Only attributes without underscores are supported in component "
"attribute declarations (because underscore and non-underscore "
"attributes are connected anyways): {attr} -> {solution}")
E185 = ("Received invalid attribute in component attribute declaration: "
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
@add_codes

View File

@ -18,13 +18,8 @@ from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .lookups import Lookups
from .pipeline import DependencyParser, Tagger
from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler
from .pipeline import Morphologizer
from .compat import izip, basestring_, is_python2
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .compat import izip, basestring_, is_python2, class_types
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
@ -40,6 +35,9 @@ from . import util
from . import about
ENABLE_PIPELINE_ANALYSIS = False
class BaseDefaults(object):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
@ -135,19 +133,6 @@ class Language(object):
factories = {
"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
"tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
"tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
"morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
"parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
"ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
"entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
"similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
"textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
"sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
"merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
"merge_entities": lambda nlp, **cfg: merge_entities,
"merge_subtokens": lambda nlp, **cfg: merge_subtokens,
"entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg),
}
def __init__(
@ -218,6 +203,7 @@ class Language(object):
"name": self.vocab.vectors.name,
}
self._meta["pipeline"] = self.pipe_names
self._meta["factories"] = self.pipe_factories
self._meta["labels"] = self.pipe_labels
return self._meta
@ -259,6 +245,17 @@ class Language(object):
"""
return [pipe_name for pipe_name, _ in self.pipeline]
@property
def pipe_factories(self):
    """Get the component factories for the available pipeline components.
    Components without a "factory" attribute fall back to their own name
    in the pipeline.

    RETURNS (dict): Factory names, keyed by component names.
    """
    return {
        pipe_name: getattr(pipe, "factory", pipe_name)
        for pipe_name, pipe in self.pipeline
    }
@property
def pipe_labels(self):
"""Get the labels set by the pipeline components, if available (if
@ -327,33 +324,30 @@ class Language(object):
msg += Errors.E004.format(component=component)
raise ValueError(msg)
if name is None:
if hasattr(component, "name"):
name = component.name
elif hasattr(component, "__name__"):
name = component.__name__
elif hasattr(component, "__class__") and hasattr(
component.__class__, "__name__"
):
name = component.__class__.__name__
else:
name = repr(component)
name = util.get_component_name(component)
if name in self.pipe_names:
raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
raise ValueError(Errors.E006)
pipe_index = 0
pipe = (name, component)
if last or not any([first, before, after]):
pipe_index = len(self.pipeline)
self.pipeline.append(pipe)
elif first:
self.pipeline.insert(0, pipe)
elif before and before in self.pipe_names:
pipe_index = self.pipe_names.index(before)
self.pipeline.insert(self.pipe_names.index(before), pipe)
elif after and after in self.pipe_names:
pipe_index = self.pipe_names.index(after) + 1
self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
else:
raise ValueError(
Errors.E001.format(name=before or after, opts=self.pipe_names)
)
if ENABLE_PIPELINE_ANALYSIS:
analyze_pipes(self.pipeline, name, component, pipe_index)
def has_pipe(self, name):
"""Check if a component name is present in the pipeline. Equivalent to
@ -382,6 +376,8 @@ class Language(object):
msg += Errors.E135.format(name=name)
raise ValueError(msg)
self.pipeline[self.pipe_names.index(name)] = (name, component)
if ENABLE_PIPELINE_ANALYSIS:
analyze_all_pipes(self.pipeline)
def rename_pipe(self, old_name, new_name):
"""Rename a pipeline component.
@ -408,6 +404,8 @@ class Language(object):
"""
if name not in self.pipe_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
if ENABLE_PIPELINE_ANALYSIS:
analyze_all_pipes(self.pipeline)
return self.pipeline.pop(self.pipe_names.index(name))
def __call__(self, text, disable=[], component_cfg=None):
@ -1001,6 +999,52 @@ class Language(object):
return self
class component(object):
    """Decorator for pipeline components. Can decorate both function components
    and class components and will automatically register components in the
    Language.factories. If the component is a class and needs access to the
    nlp object or config parameters, it can expose a from_nlp classmethod
    that takes the nlp object and **cfg arguments and returns the initialized
    component.
    """

    # NB: This decorator needs to live here, because it needs to write to
    # Language.factories. All other solutions would cause circular import.

    def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
        """Decorate a pipeline component.

        name (unicode): Default component and factory name.
        assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
        requires (list): Attributes required by component, e.g. `["token.dep"]`.
        retokenizes (bool): Whether the component changes the tokenization.
        """
        self.name = name
        # Both declarations are validated up front, so invalid attributes
        # raise when the decorator is created.
        self.assigns = validate_attrs(assigns)
        self.requires = validate_attrs(requires)
        self.retokenizes = retokenizes

    def __call__(self, *args, **kwargs):
        """Annotate the decorated object and register a factory for it.

        The decorated function or class is expected as the first positional
        argument. Any further positional or keyword arguments are ignored.
        The metadata is written directly onto the object (no wrapper is
        created) and the factory is stored in Language.factories under the
        factory name.

        RETURNS: The decorated object itself.
        """
        obj = args[0]
        # Fall back to the name derived from the object if none was given
        factory_name = self.name or util.get_component_name(obj)
        obj.name = factory_name
        obj.factory = factory_name
        obj.assigns = self.assigns
        obj.requires = self.requires
        obj.retokenizes = self.retokenizes

        def factory(nlp, **cfg):
            # Prefer from_nlp if available, since it receives nlp and config
            if hasattr(obj, "from_nlp"):
                return obj.from_nlp(nlp, **cfg)
            elif isinstance(obj, class_types):
                # Plain classes are initialized without arguments
                return obj()
            # Function components are returned as they are
            return obj

        Language.factories[obj.factory] = factory
        return obj
def _fix_pretrained_vectors_name(nlp):
# TODO: Replace this once we handle vectors consistently as static
# data

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
from collections import defaultdict, OrderedDict
import srsly
from ..language import component
from ..errors import Errors
from ..compat import basestring_
from ..util import ensure_path, to_disk, from_disk
@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher
DEFAULT_ENT_ID_SEP = "||"
@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"])
class EntityRuler(object):
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical
@ -24,8 +26,6 @@ class EntityRuler(object):
USAGE: https://spacy.io/usage/rule-based-matching#entityruler
"""
name = "entity_ruler"
def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
"""Initialize the entitiy ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"`
@ -69,6 +69,10 @@ class EntityRuler(object):
if patterns is not None:
self.add_patterns(patterns)
@classmethod
def from_nlp(cls, nlp, **cfg):
    """Create the component from the nlp object. The nlp object itself and
    the config are passed on to the class constructor.

    nlp (Language): The nlp object.
    **cfg: Config parameters passed on to the constructor.
    RETURNS: The initialized component.
    """
    return cls(nlp, **cfg)
def __len__(self):
"""The number of all patterns added to the entity ruler."""
n_token_patterns = sum(len(p) for p in self.token_patterns.values())

View File

@ -1,9 +1,15 @@
# coding: utf8
from __future__ import unicode_literals
from ..language import component
from ..matcher import Matcher
@component(
"merge_noun_chunks",
requires=["token.dep", "token.tag", "token.pos"],
retokenizes=True,
)
def merge_noun_chunks(doc):
"""Merge noun chunks into a single token.
@ -21,6 +27,11 @@ def merge_noun_chunks(doc):
return doc
@component(
"merge_entities",
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
retokenizes=True,
)
def merge_entities(doc):
"""Merge entities into a single token.
@ -36,6 +47,7 @@ def merge_entities(doc):
return doc
@component("merge_subtokens", requires=["token.dep"], retokenizes=True)
def merge_subtokens(doc, label="subtok"):
"""Merge subtokens into a single token.

View File

@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from .pipes import Pipe
from ..language import component
from .._ml import link_vectors_to_models
@component("sentencizer_hook", assigns=["doc.user_hooks"])
class SentenceSegmenter(object):
"""A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse). To change the sentence
@ -17,8 +19,6 @@ class SentenceSegmenter(object):
and yield `Span` objects for each sentence.
"""
name = "sentencizer"
def __init__(self, vocab, strategy=None):
self.vocab = vocab
if strategy is None or strategy == "on_punct":
@ -44,6 +44,7 @@ class SentenceSegmenter(object):
yield doc[start : len(doc)]
@component("similarity", assigns=["doc.user_hooks"])
class SimilarityHook(Pipe):
"""
Experimental: A pipeline component to install a hook for supervised
@ -58,8 +59,6 @@ class SimilarityHook(Pipe):
Where W is a vector of dimension weights, initialized to 1.
"""
name = "similarity"
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model

View File

@ -8,6 +8,7 @@ from thinc.api import chain
from thinc.neural.util import to_categorical, copy_array, get_array_module
from .. import util
from .pipes import Pipe
from ..language import component
from .._ml import Tok2Vec, build_morphologizer_model
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import create_default_optimizer
@ -18,9 +19,9 @@ from ..vocab cimport Vocab
from ..morphology cimport Morphology
@component("morphologizer", assigns=["token.morph", "token.pos"])
class Morphologizer(Pipe):
name = 'morphologizer'
@classmethod
def Model(cls, **cfg):
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):

View File

@ -13,7 +13,6 @@ from thinc.misc import LayerNorm
from thinc.neural.util import to_categorical
from thinc.neural.util import get_array_module
from .functions import merge_subtokens
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
from ..syntax.ner cimport BiluoPushDown
@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager
from ..morphology cimport Morphology
from ..vocab cimport Vocab
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
from ..attrs import POS, ID
from ..parts_of_speech import X
@ -54,6 +55,10 @@ class Pipe(object):
"""Initialize a model for the pipe."""
raise NotImplementedError
@classmethod
def from_nlp(cls, nlp, **cfg):
    """Create the component from the nlp object. The shared nlp.vocab and
    the config are passed on to the class constructor.

    nlp (Language): The nlp object.
    **cfg: Config parameters passed on to the constructor.
    RETURNS: The initialized component.
    """
    return cls(nlp.vocab, **cfg)
def __init__(self, vocab, model=True, **cfg):
"""Create a new pipe instance."""
raise NotImplementedError
@ -223,11 +228,10 @@ class Pipe(object):
return self
@component("tensorizer", assigns=["doc.tensor"])
class Tensorizer(Pipe):
"""Pre-train position-sensitive vectors for tokens."""
name = "tensorizer"
@classmethod
def Model(cls, output_size=300, **cfg):
"""Create a new statistical model for the class.
@ -362,14 +366,13 @@ class Tensorizer(Pipe):
return sgd
@component("tagger", assigns=["token.tag", "token.pos"])
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.
DOCS: https://spacy.io/api/tagger
"""
name = "tagger"
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
@ -657,13 +660,12 @@ class Tagger(Pipe):
return self
@component("nn_labeller")
class MultitaskObjective(Tagger):
"""Experimental: Assist training of a parser or tagger, by training a
side-objective.
"""
name = "nn_labeller"
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
self.vocab = vocab
self.model = model
@ -898,12 +900,12 @@ class ClozeMultitask(Pipe):
losses[self.name] += loss
@component("textcat", assigns=["doc.cats"])
class TextCategorizer(Pipe):
"""Pipeline component for text classification.
DOCS: https://spacy.io/api/textcategorizer
"""
name = 'textcat'
@classmethod
def Model(cls, nr_class=1, **cfg):
@ -1051,8 +1053,11 @@ cdef class DependencyParser(Parser):
DOCS: https://spacy.io/api/dependencyparser
"""
# cdef classes can't have decorators, so we're defining this here
name = "parser"
factory = "parser"
assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
requires = []
TransitionSystem = ArcEager
@property
@ -1097,8 +1102,10 @@ cdef class EntityRecognizer(Parser):
DOCS: https://spacy.io/api/entityrecognizer
"""
name = "ner"
factory = "ner"
assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
requires = []
TransitionSystem = BiluoPushDown
nr_feature = 6
@ -1129,12 +1136,16 @@ cdef class EntityRecognizer(Parser):
return tuple(sorted(labels))
@component(
"entity_linker",
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"]
)
class EntityLinker(Pipe):
"""Pipeline component for named entity linking.
DOCS: https://spacy.io/api/entitylinker
"""
name = 'entity_linker'
NIL = "NIL" # string used to refer to a non-existing link
@classmethod
@ -1405,13 +1416,13 @@ class EntityLinker(Pipe):
raise NotImplementedError
@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
class Sentencizer(object):
"""Segment the Doc into sentences using a rule-based strategy.
DOCS: https://spacy.io/api/sentencizer
"""
name = "sentencizer"
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
'', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '᱿',
@ -1437,6 +1448,10 @@ class Sentencizer(object):
else:
self.punct_chars = set(self.default_punct_chars)
@classmethod
def from_nlp(cls, nlp, **cfg):
    """Create the component from the nlp object. The nlp object is accepted
    but not used here: only the config is passed on to the constructor.

    nlp (Language): The nlp object (unused).
    **cfg: Config parameters passed on to the constructor.
    RETURNS: The initialized component.
    """
    return cls(**cfg)
def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
@ -1503,4 +1518,9 @@ class Sentencizer(object):
return self
# Cython classes can't be decorated, so we need to add the factories here
Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]

View File

@ -128,6 +128,10 @@ cdef class Parser:
self._multitasks = []
self._rehearsal_model = None
@classmethod
def from_nlp(cls, nlp, **cfg):
    """Create the component from the nlp object. The shared nlp.vocab and
    the config are passed on to the class constructor.

    nlp (Language): The nlp object.
    **cfg: Config parameters passed on to the constructor.
    RETURNS: The initialized component.
    """
    return cls(nlp.vocab, **cfg)
def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None)

View File

@ -0,0 +1,146 @@
# coding: utf8
from __future__ import unicode_literals
import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary, validate_attrs
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
from spacy.compat import is_python2
from mock import Mock, ANY
import pytest
def test_component_decorator_function():
    """Test that a decorated function gets the declared name and can still
    be called like the undecorated function."""
    @component(name="test")
    def test_component(doc):
        """docstring"""
        return doc

    assert test_component.name == "test"
    # The docstring is only checked on Python 3
    if not is_python2:
        assert test_component.__doc__ == "docstring"
    assert test_component("foo") == "foo"
def test_component_decorator_class():
    """Test that a decorated class gets the declared name and keeps its own
    attributes, methods and docstrings, both on the class and on instances."""
    @component(name="test")
    class TestComponent(object):
        """docstring1"""

        foo = "bar"

        def __call__(self, doc):
            """docstring2"""
            return doc

        def custom(self, x):
            """docstring3"""
            return x

    assert TestComponent.name == "test"
    assert TestComponent.foo == "bar"
    assert hasattr(TestComponent, "custom")
    test_component = TestComponent()
    assert test_component.foo == "bar"
    assert test_component("foo") == "foo"
    assert hasattr(test_component, "custom")
    assert test_component.custom("bar") == "bar"
    # Docstrings are only checked on Python 3
    if not is_python2:
        assert TestComponent.__doc__ == "docstring1"
        assert TestComponent.__call__.__doc__ == "docstring2"
        assert TestComponent.custom.__doc__ == "docstring3"
        assert test_component.__doc__ == "docstring1"
        assert test_component.__call__.__doc__ == "docstring2"
        assert test_component.custom.__doc__ == "docstring3"
def test_component_decorator_assigns():
    """Test the assigns/requires declarations of decorated components: the
    factories are registered, unmet requirements produce a user warning when
    the pipeline analysis is enabled, and the analysis helpers report the
    expected components."""
    # The flag is module-level state, so remember the previous value and put
    # it back at the end. Otherwise it would stay enabled for other tests.
    previous_flag = spacy.language.ENABLE_PIPELINE_ANALYSIS
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
    try:

        @component("c1", assigns=["token.tag", "doc.tensor"])
        def test_component1(doc):
            return doc

        @component(
            "c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"]
        )
        def test_component2(doc):
            return doc

        @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
        def test_component3(doc):
            return doc

        assert "c1" in Language.factories
        assert "c2" in Language.factories
        assert "c3" in Language.factories

        nlp = Language()
        nlp.add_pipe(test_component1)
        # c2 requires token.pos, which no previous component assigns
        with pytest.warns(UserWarning):
            nlp.add_pipe(test_component2)
        nlp.add_pipe(test_component3)
        assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
        assert [name for name, _ in assigns_tensor] == ["c1", "c2"]
        # Create another component from the "c1" factory under a new name
        test_component4 = nlp.create_pipe("c1")
        assert test_component4.name == "c1"
        assert test_component4.factory == "c1"
        nlp.add_pipe(test_component4, name="c4")
        assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
        assert "c4" not in Language.factories
        assert nlp.pipe_factories["c1"] == "c1"
        assert nlp.pipe_factories["c4"] == "c1"
        assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
        assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
        requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
        assert [name for name, _ in requires_pos] == ["c2"]
        assert print_summary(nlp, no_print=True)
        assert nlp("hello world")
    finally:
        spacy.language.ENABLE_PIPELINE_ANALYSIS = previous_flag
def test_component_factories_from_nlp():
    """Test that class components can implement a from_nlp classmethod that
    gives them access to the nlp object and config via the factory."""

    class TestComponent5(object):
        def __call__(self, doc):
            return doc

    # Use a mock as the from_nlp classmethod so the call can be inspected.
    # It's attached before decorating, so the class has it when the factory
    # is registered.
    mock = Mock()
    mock.return_value = TestComponent5()
    TestComponent5.from_nlp = classmethod(mock)
    TestComponent5 = component("c5")(TestComponent5)

    assert "c5" in Language.factories
    nlp = Language()
    pipe = nlp.create_pipe("c5", config={"foo": "bar"})
    nlp.add_pipe(pipe)
    assert nlp("hello world")
    # The first argument here is the class itself, so we're accepting any here
    mock.assert_called_once_with(ANY, nlp, foo="bar")
def test_analysis_validate_attrs_valid():
    """Test that valid attribute declarations pass validation, both as a
    list and individually, and that one invalid entry in an otherwise valid
    list raises an error."""
    attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz"]
    assert validate_attrs(attrs)
    for attr in attrs:
        assert validate_attrs([attr])
    # doc.xyz doesn't exist on the Doc
    with pytest.raises(ValueError):
        validate_attrs(["doc.sents", "doc.xyz"])
@pytest.mark.parametrize(
    "attr",
    [
        "doc",
        "doc_ents",
        "doc.xyz",
        "token.xyz",
        "token.tag_",
        "token.tag.xyz",
        "token._.xyz.abc",
    ],
)
def test_analysis_validate_attrs_invalid(attr):
    """Test that each of the malformed or unknown attribute declarations
    above is rejected with a ValueError."""
    with pytest.raises(ValueError):
        validate_attrs([attr])

View File

@ -247,6 +247,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
cls = get_lang_class(lang)
nlp = cls(meta=meta, **overrides)
pipeline = meta.get("pipeline", [])
factories = meta.get("factories", {})
disable = overrides.get("disable", [])
if pipeline is True:
pipeline = nlp.Defaults.pipe_names
@ -255,7 +256,8 @@ def load_model_from_path(model_path, meta=False, **overrides):
for name in pipeline:
if name not in disable:
config = meta.get("pipeline_args", {}).get(name, {})
component = nlp.create_pipe(name, config=config)
factory = factories.get(name, name)
component = nlp.create_pipe(factory, config=config)
nlp.add_pipe(component, name=name)
return nlp.from_disk(model_path)
@ -368,6 +370,16 @@ def is_in_jupyter():
return False
def get_component_name(component):
    """Get a name for a pipeline component. Checks the component's "name"
    attribute first, then its "__name__", then the "__name__" of its class,
    and falls back to the repr of the component.

    component (object): The component to get the name for.
    RETURNS (unicode): The component name.
    """
    for attr in ("name", "__name__"):
        if hasattr(component, attr):
            return getattr(component, attr)
    if hasattr(component, "__class__"):
        component_cls = component.__class__
        if hasattr(component_cls, "__name__"):
            return component_cls.__name__
    return repr(component)
def get_cuda_stream(require=False):
if CudaStream is None:
return None