Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 09:44:36 +03:00)
Component decorator and component analysis (#4517)
* Add work in progress
* Update analysis helpers and component decorator
* Fix porting of docstrings for Python 2
* Fix docstring stuff on Python 2
* Support meta factories when loading model
* Put auto pipeline analysis behind flag for now
* Analyse pipes on remove_pipe and replace_pipe
* Move analysis to root for now
  Try to find a better place for it, but it needs to go for now to avoid circular imports
* Simplify decorator
  Don't return a wrapped class and instead just write to the object
* Update existing components and factories
* Add condition in factory for classes vs. functions
* Add missing from_nlp classmethods
* Add "retokenizes" to printed overview
* Update assigns/requires declarations of builtins
* Only return data if no_print is enabled
* Use multiline table for overview
* Don't support Span
* Rewrite errors/warnings and move them to spacy.errors
This commit is contained in: parent 1180304449, commit a9c6104047
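For orientation, a minimal usage sketch of the API this commit introduces, pieced together from the diff below; the component name and attributes in the example are illustrative, not part of the commit:

    # Assumes spaCy at this commit; "lemma_fixer" is a made-up component.
    import spacy.language
    from spacy.language import Language, component
    from spacy.analysis import print_summary

    spacy.language.ENABLE_PIPELINE_ANALYSIS = True  # analysis is behind a flag for now

    @component("lemma_fixer", requires=["token.pos"], assigns=["token.lemma"])
    def lemma_fixer(doc):
        # Function component: the decorator writes name/factory/assigns/requires
        # onto the function object and registers it in Language.factories.
        return doc

    nlp = Language()
    nlp.add_pipe(lemma_fixer)  # warns: nothing earlier in the pipeline assigns token.pos
    print_summary(nlp)         # table of assigns/requires/retokenizes plus any problems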
@@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=7.2.0,<7.3.0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.2.0,<1.1.0
+wasabi>=0.3.0,<1.1.0
 srsly>=0.1.0,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
@@ -49,7 +49,7 @@ install_requires =
     blis>=0.4.0,<0.5.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    wasabi>=0.2.0,<1.1.0
+    wasabi>=0.3.0,<1.1.0
     srsly>=0.1.0,<1.1.0
     pathlib==1.0.1; python_version < "3.4"
     importlib_metadata>=0.20; python_version < "3.8"
@@ -9,12 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
 # These are imported as part of the API
 from thinc.neural.util import prefer_gpu, require_gpu

+from . import pipeline
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__
 from .errors import Errors, Warnings, deprecation_warning
 from . import util
 from .util import register_architecture, get_architecture
+from .language import component


 if sys.maxunicode == 65535:
spacy/analysis.py (new file, 176 lines)
@@ -0,0 +1,176 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+from wasabi import Printer
+
+from .tokens import Doc, Token
+from .errors import Errors, Warnings, user_warning
+
+
+def analyze_pipes(pipeline, name, pipe, index, warn=True):
+    """Analyze a pipeline component with respect to its position in the current
+    pipeline and the other components. Will check whether requirements are
+    fulfilled (e.g. if previous components assign the attributes).
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    name (unicode): The name of the pipeline component to analyze.
+    pipe (callable): The pipeline component function to analyze.
+    index (int): The index of the component in the pipeline.
+    warn (bool): Show user warning if problem is found.
+    RETURNS (list): The problems found for the given pipeline component.
+    """
+    assert pipeline[index][0] == name
+    prev_pipes = pipeline[:index]
+    pipe_requires = getattr(pipe, "requires", [])
+    requires = OrderedDict([(annot, False) for annot in pipe_requires])
+    if requires:
+        for prev_name, prev_pipe in prev_pipes:
+            prev_assigns = getattr(prev_pipe, "assigns", [])
+            for annot in prev_assigns:
+                requires[annot] = True
+    problems = []
+    for annot, fulfilled in requires.items():
+        if not fulfilled:
+            problems.append(annot)
+            if warn:
+                user_warning(Warnings.W025.format(name=name, attr=annot))
+    return problems
+
+
+def analyze_all_pipes(pipeline, warn=True):
+    """Analyze all pipes in the pipeline in order.
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    warn (bool): Show user warning if problem is found.
+    RETURNS (dict): The problems found, keyed by component name.
+    """
+    problems = {}
+    for i, (name, pipe) in enumerate(pipeline):
+        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    return problems
+
+
+def dot_to_dict(values):
+    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
+    become {"token": {"pos": True, "_": {"xyz": True }}}.
+
+    values (iterable): The values to convert.
+    RETURNS (dict): The converted values.
+    """
+    result = {}
+    for value in values:
+        path = result
+        parts = value.lower().split(".")
+        for i, item in enumerate(parts):
+            is_last = i == len(parts) - 1
+            path = path.setdefault(item, True if is_last else {})
+    return result
+
+
+def validate_attrs(values):
+    """Validate component attributes provided to "assigns", "requires" etc.
+    Raises error for invalid attributes and formatting. Doesn't check if
+    custom extension attributes are registered, since this is something the
+    user might want to do themselves later in the component.
+
+    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (iterable): The checked attributes.
+    """
+    data = dot_to_dict(values)
+    objs = {"doc": Doc, "token": Token}
+    for obj_key, attrs in data.items():
+        if obj_key not in objs:  # first element is not doc/token
+            if obj_key == "span":
+                span_attrs = [attr for attr in values if attr.startswith("span.")]
+                raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
+            invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key))
+            raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
+        if not isinstance(attrs, dict):  # attr is something like "doc"
+            raise ValueError(Errors.E182.format(attr=obj_key))
+        for attr, value in attrs.items():
+            if attr == "_":
+                if value is True:  # attr is something like "doc._"
+                    raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
+                for ext_attr, ext_value in value.items():
+                    # We don't check whether the attribute actually exists
+                    if ext_value is not True:  # attr is something like doc._.x.y
+                        good = "{}._.{}".format(obj_key, ext_attr)
+                        bad = "{}.{}".format(good, ".".join(ext_value))
+                        raise ValueError(Errors.E183.format(attr=bad, solution=good))
+                continue  # we can't validate those further
+            if attr.endswith("_"):  # attr is something like "token.pos_"
+                raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
+            if value is not True:  # attr is something like doc.x.y
+                good = "{}.{}".format(obj_key, attr)
+                bad = "{}.{}".format(good, ".".join(value))
+                raise ValueError(Errors.E183.format(attr=bad, solution=good))
+            obj = objs[obj_key]
+            if not hasattr(obj, attr):
+                raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
+    return values
+
+
+def _get_feature_for_attr(pipeline, attr, feature):
+    assert feature in ["assigns", "requires"]
+    result = []
+    for pipe_name, pipe in pipeline:
+        pipe_assigns = getattr(pipe, feature, [])
+        if attr in pipe_assigns:
+            result.append((pipe_name, pipe))
+    return result
+
+
+def get_assigns_for_attr(pipeline, attr):
+    """Get all pipeline components that assign an attr, e.g. "doc.tensor".
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    attr (unicode): The attribute to check.
+    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    """
+    return _get_feature_for_attr(pipeline, attr, "assigns")
+
+
+def get_requires_for_attr(pipeline, attr):
+    """Get all pipeline components that require an attr, e.g. "doc.tensor".
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    attr (unicode): The attribute to check.
+    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    """
+    return _get_feature_for_attr(pipeline, attr, "requires")
+
+
+def print_summary(nlp, pretty=True, no_print=False):
+    """Print a formatted summary for the current nlp object's pipeline. Shows
+    a table with the pipeline components and why they assign and require, as
+    well as any problems if available.
+
+    nlp (Language): The nlp object.
+    pretty (bool): Pretty-print the results (color etc).
+    no_print (bool): Don't print anything, just return the data.
+    RETURNS (dict): A dict with "overview" and "problems".
+    """
+    msg = Printer(pretty=pretty, no_print=no_print)
+    overview = []
+    problems = {}
+    for i, (name, pipe) in enumerate(nlp.pipeline):
+        requires = getattr(pipe, "requires", [])
+        assigns = getattr(pipe, "assigns", [])
+        retok = getattr(pipe, "retokenizes", False)
+        overview.append((i, name, requires, assigns, retok))
+        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    msg.divider("Pipeline Overview")
+    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
+    msg.table(overview, header=header, divider=True, multiline=True)
+    n_problems = sum(len(p) for p in problems.values())
+    if any(p for p in problems.values()):
+        msg.divider("Problems ({})".format(n_problems))
+        for name, problem in problems.items():
+            if problem:
+                problem = ", ".join(problem)
+                msg.warn("'{}' requirements not met: {}".format(name, problem))
+    else:
+        msg.good("No problems found.")
+    if no_print:
+        return {"overview": overview, "problems": problems}
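A quick sketch of how the helpers in this new module behave, following the docstrings and code above (the outputs in comments are what the code should produce, not captured from a run):

    from spacy.analysis import dot_to_dict, validate_attrs

    dot_to_dict(["token.pos", "token._.xyz"])
    # {"token": {"pos": True, "_": {"xyz": True}}}

    validate_attrs(["doc.ents", "token._.my_attr"])  # returns the list unchanged
    validate_attrs(["span.label"])                   # raises ValueError (E180: Span not supported)
    validate_attrs(["token.pos_"])                   # raises ValueError (E184: use "token.pos")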
@@ -12,6 +12,7 @@ import os
 import sys
 import itertools
 import ast
+import types

 from thinc.neural.util import copy_array

@@ -67,6 +68,7 @@ if is_python2:
     basestring_ = basestring  # noqa: F821
     input_ = raw_input  # noqa: F821
     path2str = lambda path: str(path).decode("utf8")
+    class_types = (type, types.ClassType)

 elif is_python3:
     bytes_ = bytes
@@ -74,6 +76,7 @@ elif is_python3:
     basestring_ = str
     input_ = input
     path2str = lambda path: str(path)
+    class_types = (type, types.ClassType) if is_python_pre_3_5 else type


 def b_to_str(b_str):
@@ -99,6 +99,8 @@ class Warnings(object):
             "'n_process' will be set to 1.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
+    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
+            "previous components in the pipeline declare that they assign it.")


 @add_codes
@@ -511,6 +513,20 @@ class Errors(object):
     E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
             "Doc. If you only want to add one pattern, make sure to wrap it "
             "in a list. For example: matcher.add('{key}', [doc])")
+    E180 = ("Span attributes can't be declared as required or assigned by "
+            "components, since spans are only views of the Doc. Use Doc and "
+            "Token attributes only and remove the following: {attrs}")
+    E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
+            "Only Doc and Token attributes are supported.")
+    E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
+            "to define the attribute? For example: {attr}.???")
+    E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
+            "attributes are supported, for example: {solution}")
+    E184 = ("Only attributes without underscores are supported in component "
+            "attribute declarations (because underscore and non-underscore "
+            "attributes are connected anyways): {attr} -> {solution}")
+    E185 = ("Received invalid attribute in component attribute declaration: "
+            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")


 @add_codes
@@ -18,13 +18,8 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .lookups import Lookups
-from .pipeline import DependencyParser, Tagger
-from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
-from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
-from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
-from .pipeline import EntityRuler
-from .pipeline import Morphologizer
-from .compat import izip, basestring_, is_python2
+from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .compat import izip, basestring_, is_python2, class_types
 from .gold import GoldParse
 from .scorer import Scorer
 from ._ml import link_vectors_to_models, create_default_optimizer
@@ -40,6 +35,9 @@ from . import util
 from . import about


+ENABLE_PIPELINE_ANALYSIS = False
+
+
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
@@ -135,19 +133,6 @@ class Language(object):

     factories = {
         "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
-        "tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
-        "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
-        "morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
-        "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
-        "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
-        "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
-        "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
-        "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
-        "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
-        "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
-        "merge_entities": lambda nlp, **cfg: merge_entities,
-        "merge_subtokens": lambda nlp, **cfg: merge_subtokens,
-        "entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg),
     }

     def __init__(
@@ -218,6 +203,7 @@ class Language(object):
             "name": self.vocab.vectors.name,
         }
         self._meta["pipeline"] = self.pipe_names
+        self._meta["factories"] = self.pipe_factories
         self._meta["labels"] = self.pipe_labels
         return self._meta

@@ -259,6 +245,17 @@ class Language(object):
         """
         return [pipe_name for pipe_name, _ in self.pipeline]

+    @property
+    def pipe_factories(self):
+        """Get the component factories for the available pipeline components.
+
+        RETURNS (dict): Factory names, keyed by component names.
+        """
+        factories = {}
+        for pipe_name, pipe in self.pipeline:
+            factories[pipe_name] = getattr(pipe, "factory", pipe_name)
+        return factories
+
     @property
     def pipe_labels(self):
         """Get the labels set by the pipeline components, if available (if
@@ -327,33 +324,30 @@ class Language(object):
                 msg += Errors.E004.format(component=component)
             raise ValueError(msg)
         if name is None:
-            if hasattr(component, "name"):
-                name = component.name
-            elif hasattr(component, "__name__"):
-                name = component.__name__
-            elif hasattr(component, "__class__") and hasattr(
-                component.__class__, "__name__"
-            ):
-                name = component.__class__.__name__
-            else:
-                name = repr(component)
+            name = util.get_component_name(component)
         if name in self.pipe_names:
             raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
             raise ValueError(Errors.E006)
+        pipe_index = 0
         pipe = (name, component)
         if last or not any([first, before, after]):
+            pipe_index = len(self.pipeline)
             self.pipeline.append(pipe)
         elif first:
             self.pipeline.insert(0, pipe)
         elif before and before in self.pipe_names:
+            pipe_index = self.pipe_names.index(before)
             self.pipeline.insert(self.pipe_names.index(before), pipe)
         elif after and after in self.pipe_names:
+            pipe_index = self.pipe_names.index(after) + 1
             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
         else:
             raise ValueError(
                 Errors.E001.format(name=before or after, opts=self.pipe_names)
             )
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_pipes(self.pipeline, name, component, pipe_index)

     def has_pipe(self, name):
         """Check if a component name is present in the pipeline. Equivalent to
@@ -382,6 +376,8 @@ class Language(object):
                 msg += Errors.E135.format(name=name)
             raise ValueError(msg)
         self.pipeline[self.pipe_names.index(name)] = (name, component)
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_all_pipes(self.pipeline)

     def rename_pipe(self, old_name, new_name):
         """Rename a pipeline component.
@@ -408,6 +404,8 @@ class Language(object):
         """
         if name not in self.pipe_names:
             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_all_pipes(self.pipeline)
         return self.pipeline.pop(self.pipe_names.index(name))

     def __call__(self, text, disable=[], component_cfg=None):
@@ -1001,6 +999,52 @@ class Language(object):
         return self


+class component(object):
+    """Decorator for pipeline components. Can decorate both function components
+    and class components and will automatically register components in the
+    Language.factories. If the component is a class and needs access to the
+    nlp object or config parameters, it can expose a from_nlp classmethod
+    that takes the nlp object and **cfg arguments and returns the initialized
+    component.
+    """
+
+    # NB: This decorator needs to live here, because it needs to write to
+    # Language.factories. All other solutions would cause circular import.
+
+    def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
+        """Decorate a pipeline component.
+
+        name (unicode): Default component and factory name.
+        assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
+        requires (list): Attributes required by component, e.g. `["token.dep"]`.
+        retokenizes (bool): Whether the component changes the tokenization.
+        """
+        self.name = name
+        self.assigns = validate_attrs(assigns)
+        self.requires = validate_attrs(requires)
+        self.retokenizes = retokenizes
+
+    def __call__(self, *args, **kwargs):
+        obj = args[0]
+        args = args[1:]
+        factory_name = self.name or util.get_component_name(obj)
+        obj.name = factory_name
+        obj.factory = factory_name
+        obj.assigns = self.assigns
+        obj.requires = self.requires
+        obj.retokenizes = self.retokenizes
+
+        def factory(nlp, **cfg):
+            if hasattr(obj, "from_nlp"):
+                return obj.from_nlp(nlp, **cfg)
+            elif isinstance(obj, class_types):
+                return obj()
+            return obj
+
+        Language.factories[obj.factory] = factory
+        return obj
+
+
 def _fix_pretrained_vectors_name(nlp):
     # TODO: Replace this once we handle vectors consistently as static
     # data
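A minimal sketch of the class-component path added above, under the same assumptions as before (the "doc_cache" component and its config are made up): a decorated class can expose from_nlp, and the factory that the decorator registers calls it with the nlp object and the config.

    from spacy.language import Language, component

    @component("doc_cache", assigns=["doc._.cached"])
    class DocCache(object):
        @classmethod
        def from_nlp(cls, nlp, **cfg):
            # The registered factory calls this with the nlp object and **cfg.
            return cls(nlp.vocab, **cfg)

        def __init__(self, vocab, greedy=False):
            self.vocab = vocab
            self.greedy = greedy

        def __call__(self, doc):
            return doc

    nlp = Language()
    pipe = nlp.create_pipe("doc_cache", config={"greedy": True})
    nlp.add_pipe(pipe)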
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from collections import defaultdict, OrderedDict
 import srsly

+from ..language import component
 from ..errors import Errors
 from ..compat import basestring_
 from ..util import ensure_path, to_disk, from_disk
@@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher
 DEFAULT_ENT_ID_SEP = "||"


+@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"])
 class EntityRuler(object):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
@@ -24,8 +26,6 @@ class EntityRuler(object):
     USAGE: https://spacy.io/usage/rule-based-matching#entityruler
     """

-    name = "entity_ruler"
-
     def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
         """Initialize the entitiy ruler. If patterns are supplied here, they
         need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -69,6 +69,10 @@ class EntityRuler(object):
         if patterns is not None:
             self.add_patterns(patterns)

+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(nlp, **cfg)
+
     def __len__(self):
         """The number of all patterns added to the entity ruler."""
         n_token_patterns = sum(len(p) for p in self.token_patterns.values())
@@ -1,9 +1,15 @@
 # coding: utf8
 from __future__ import unicode_literals

+from ..language import component
 from ..matcher import Matcher


+@component(
+    "merge_noun_chunks",
+    requires=["token.dep", "token.tag", "token.pos"],
+    retokenizes=True,
+)
 def merge_noun_chunks(doc):
     """Merge noun chunks into a single token.

@@ -21,6 +27,11 @@ def merge_noun_chunks(doc):
     return doc


+@component(
+    "merge_entities",
+    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+    retokenizes=True,
+)
 def merge_entities(doc):
     """Merge entities into a single token.

@@ -36,6 +47,7 @@ def merge_entities(doc):
     return doc


+@component("merge_subtokens", requires=["token.dep"], retokenizes=True)
 def merge_subtokens(doc, label="subtok"):
     """Merge subtokens into a single token.

@@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural._classes.difference import Siamese, CauchySimilarity

 from .pipes import Pipe
+from ..language import component
 from .._ml import link_vectors_to_models


+@component("sentencizer_hook", assigns=["doc.user_hooks"])
 class SentenceSegmenter(object):
     """A simple spaCy hook, to allow custom sentence boundary detection logic
     (that doesn't require the dependency parse). To change the sentence
@@ -17,8 +19,6 @@ class SentenceSegmenter(object):
     and yield `Span` objects for each sentence.
     """

-    name = "sentencizer"
-
     def __init__(self, vocab, strategy=None):
         self.vocab = vocab
         if strategy is None or strategy == "on_punct":
@@ -44,6 +44,7 @@ class SentenceSegmenter(object):
         yield doc[start : len(doc)]


+@component("similarity", assigns=["doc.user_hooks"])
 class SimilarityHook(Pipe):
     """
     Experimental: A pipeline component to install a hook for supervised
@@ -58,8 +59,6 @@ class SimilarityHook(Pipe):
     Where W is a vector of dimension weights, initialized to 1.
     """

-    name = "similarity"
-
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
@@ -8,6 +8,7 @@ from thinc.api import chain
 from thinc.neural.util import to_categorical, copy_array, get_array_module
 from .. import util
 from .pipes import Pipe
+from ..language import component
 from .._ml import Tok2Vec, build_morphologizer_model
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import create_default_optimizer
@@ -18,9 +19,9 @@ from ..vocab cimport Vocab
 from ..morphology cimport Morphology


+@component("morphologizer", assigns=["token.morph", "token.pos"])
 class Morphologizer(Pipe):
-    name = 'morphologizer'

     @classmethod
     def Model(cls, **cfg):
         if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
@@ -13,7 +13,6 @@ from thinc.misc import LayerNorm
 from thinc.neural.util import to_categorical
 from thinc.neural.util import get_array_module

-from .functions import merge_subtokens
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown
@@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager
 from ..morphology cimport Morphology
 from ..vocab cimport Vocab

+from .functions import merge_subtokens
+from ..language import Language, component
 from ..syntax import nonproj
 from ..attrs import POS, ID
 from ..parts_of_speech import X
@@ -54,6 +55,10 @@ class Pipe(object):
         """Initialize a model for the pipe."""
         raise NotImplementedError

+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(nlp.vocab, **cfg)
+
     def __init__(self, vocab, model=True, **cfg):
         """Create a new pipe instance."""
         raise NotImplementedError
@@ -223,11 +228,10 @@ class Pipe(object):
         return self


+@component("tensorizer", assigns=["doc.tensor"])
 class Tensorizer(Pipe):
     """Pre-train position-sensitive vectors for tokens."""

-    name = "tensorizer"
-
     @classmethod
     def Model(cls, output_size=300, **cfg):
         """Create a new statistical model for the class.
@@ -362,14 +366,13 @@ class Tensorizer(Pipe):
         return sgd


+@component("tagger", assigns=["token.tag", "token.pos"])
 class Tagger(Pipe):
     """Pipeline component for part-of-speech tagging.

     DOCS: https://spacy.io/api/tagger
     """

-    name = "tagger"
-
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
@@ -657,13 +660,12 @@ class Tagger(Pipe):
         return self


+@component("nn_labeller")
 class MultitaskObjective(Tagger):
     """Experimental: Assist training of a parser or tagger, by training a
     side-objective.
     """

-    name = "nn_labeller"
-
     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
         self.model = model
@@ -898,12 +900,12 @@ class ClozeMultitask(Pipe):
         losses[self.name] += loss


+@component("textcat", assigns=["doc.cats"])
 class TextCategorizer(Pipe):
     """Pipeline component for text classification.

     DOCS: https://spacy.io/api/textcategorizer
     """
-    name = 'textcat'

     @classmethod
     def Model(cls, nr_class=1, **cfg):
@@ -1051,8 +1053,11 @@ cdef class DependencyParser(Parser):

     DOCS: https://spacy.io/api/dependencyparser
     """
+    # cdef classes can't have decorators, so we're defining this here
     name = "parser"
+    factory = "parser"
+    assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
+    requires = []
     TransitionSystem = ArcEager

     @property
@@ -1097,8 +1102,10 @@ cdef class EntityRecognizer(Parser):

     DOCS: https://spacy.io/api/entityrecognizer
     """
-
     name = "ner"
+    factory = "ner"
+    assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
+    requires = []
     TransitionSystem = BiluoPushDown
     nr_feature = 6

@@ -1129,12 +1136,16 @@ cdef class EntityRecognizer(Parser):
         return tuple(sorted(labels))


+@component(
+    "entity_linker",
+    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+    assigns=["token.ent_kb_id"]
+)
 class EntityLinker(Pipe):
     """Pipeline component for named entity linking.

     DOCS: https://spacy.io/api/entitylinker
     """
-    name = 'entity_linker'
     NIL = "NIL"  # string used to refer to a non-existing link

     @classmethod
@@ -1405,13 +1416,13 @@ class EntityLinker(Pipe):
         raise NotImplementedError


+@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
 class Sentencizer(object):
     """Segment the Doc into sentences using a rule-based strategy.

     DOCS: https://spacy.io/api/sentencizer
     """

-    name = "sentencizer"
     default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
             '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
             '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
@@ -1437,6 +1448,10 @@ class Sentencizer(object):
         else:
             self.punct_chars = set(self.default_punct_chars)

+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(**cfg)
+
     def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.

@@ -1503,4 +1518,9 @@ class Sentencizer(object):
         return self


+# Cython classes can't be decorated, so we need to add the factories here
+Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
+Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
+
+
 __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]
@@ -128,6 +128,10 @@ cdef class Parser:
         self._multitasks = []
         self._rehearsal_model = None

+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(nlp.vocab, **cfg)
+
     def __reduce__(self):
         return (Parser, (self.vocab, self.moves, self.model), None, None)

spacy/tests/pipeline/test_analysis.py (new file, 146 lines)
@@ -0,0 +1,146 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy.language
+from spacy.language import Language, component
+from spacy.analysis import print_summary, validate_attrs
+from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.compat import is_python2
+from mock import Mock, ANY
+import pytest
+
+
+def test_component_decorator_function():
+    @component(name="test")
+    def test_component(doc):
+        """docstring"""
+        return doc
+
+    assert test_component.name == "test"
+    if not is_python2:
+        assert test_component.__doc__ == "docstring"
+    assert test_component("foo") == "foo"
+
+
+def test_component_decorator_class():
+    @component(name="test")
+    class TestComponent(object):
+        """docstring1"""
+
+        foo = "bar"
+
+        def __call__(self, doc):
+            """docstring2"""
+            return doc
+
+        def custom(self, x):
+            """docstring3"""
+            return x
+
+    assert TestComponent.name == "test"
+    assert TestComponent.foo == "bar"
+    assert hasattr(TestComponent, "custom")
+    test_component = TestComponent()
+    assert test_component.foo == "bar"
+    assert test_component("foo") == "foo"
+    assert hasattr(test_component, "custom")
+    assert test_component.custom("bar") == "bar"
+    if not is_python2:
+        assert TestComponent.__doc__ == "docstring1"
+        assert TestComponent.__call__.__doc__ == "docstring2"
+        assert TestComponent.custom.__doc__ == "docstring3"
+        assert test_component.__doc__ == "docstring1"
+        assert test_component.__call__.__doc__ == "docstring2"
+        assert test_component.custom.__doc__ == "docstring3"
+
+
+def test_component_decorator_assigns():
+    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
+
+    @component("c1", assigns=["token.tag", "doc.tensor"])
+    def test_component1(doc):
+        return doc
+
+    @component(
+        "c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"]
+    )
+    def test_component2(doc):
+        return doc
+
+    @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
+    def test_component3(doc):
+        return doc
+
+    assert "c1" in Language.factories
+    assert "c2" in Language.factories
+    assert "c3" in Language.factories
+
+    nlp = Language()
+    nlp.add_pipe(test_component1)
+    with pytest.warns(UserWarning):
+        nlp.add_pipe(test_component2)
+    nlp.add_pipe(test_component3)
+    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
+    assert [name for name, _ in assigns_tensor] == ["c1", "c2"]
+    test_component4 = nlp.create_pipe("c1")
+    assert test_component4.name == "c1"
+    assert test_component4.factory == "c1"
+    nlp.add_pipe(test_component4, name="c4")
+    assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
+    assert "c4" not in Language.factories
+    assert nlp.pipe_factories["c1"] == "c1"
+    assert nlp.pipe_factories["c4"] == "c1"
+    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
+    assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
+    requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
+    assert [name for name, _ in requires_pos] == ["c2"]
+    assert print_summary(nlp, no_print=True)
+    assert nlp("hello world")
+
+
+def test_component_factories_from_nlp():
+    """Test that class components can implement a from_nlp classmethod that
+    gives them access to the nlp object and config via the factory."""
+
+    class TestComponent5(object):
+        def __call__(self, doc):
+            return doc
+
+    mock = Mock()
+    mock.return_value = TestComponent5()
+    TestComponent5.from_nlp = classmethod(mock)
+    TestComponent5 = component("c5")(TestComponent5)
+
+    assert "c5" in Language.factories
+    nlp = Language()
+    pipe = nlp.create_pipe("c5", config={"foo": "bar"})
+    nlp.add_pipe(pipe)
+    assert nlp("hello world")
+    # The first argument here is the class itself, so we're accepting any here
+    mock.assert_called_once_with(ANY, nlp, foo="bar")
+
+
+def test_analysis_validate_attrs_valid():
+    attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz"]
+    assert validate_attrs(attrs)
+    for attr in attrs:
+        assert validate_attrs([attr])
+    with pytest.raises(ValueError):
+        validate_attrs(["doc.sents", "doc.xyz"])
+
+
+@pytest.mark.parametrize(
+    "attr",
+    [
+        "doc",
+        "doc_ents",
+        "doc.xyz",
+        "token.xyz",
+        "token.tag_",
+        "token.tag.xyz",
+        "token._.xyz.abc",
+    ],
+)
+def test_analysis_validate_attrs_invalid(attr):
+    with pytest.raises(ValueError):
+        validate_attrs([attr])
@@ -247,6 +247,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     cls = get_lang_class(lang)
     nlp = cls(meta=meta, **overrides)
     pipeline = meta.get("pipeline", [])
+    factories = meta.get("factories", {})
     disable = overrides.get("disable", [])
     if pipeline is True:
         pipeline = nlp.Defaults.pipe_names
@@ -255,7 +256,8 @@ def load_model_from_path(model_path, meta=False, **overrides):
     for name in pipeline:
         if name not in disable:
             config = meta.get("pipeline_args", {}).get(name, {})
-            component = nlp.create_pipe(name, config=config)
+            factory = factories.get(name, name)
+            component = nlp.create_pipe(factory, config=config)
             nlp.add_pipe(component, name=name)
     return nlp.from_disk(model_path)

@@ -368,6 +370,16 @@ def is_in_jupyter():
     return False


+def get_component_name(component):
+    if hasattr(component, "name"):
+        return component.name
+    if hasattr(component, "__name__"):
+        return component.__name__
+    if hasattr(component, "__class__") and hasattr(component.__class__, "__name__"):
+        return component.__class__.__name__
+    return repr(component)
+
+
 def get_cuda_stream(require=False):
     if CudaStream is None:
         return None