Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

	Component decorator and component analysis (#4517)
* Add work in progress
* Update analysis helpers and component decorator
* Fix porting of docstrings for Python 2
* Fix docstring stuff on Python 2
* Support meta factories when loading model
* Put auto pipeline analysis behind flag for now
* Analyse pipes on remove_pipe and replace_pipe
* Move analysis to root for now. Try to find a better place for it, but it needs to go for now to avoid circular imports
* Simplify decorator. Don't return a wrapped class and instead just write to the object
* Update existing components and factories
* Add condition in factory for classes vs. functions
* Add missing from_nlp classmethods
* Add "retokenizes" to printed overview
* Update assigns/requires declarations of builtins
* Only return data if no_print is enabled
* Use multiline table for overview
* Don't support Span
* Rewrite errors/warnings and move them to spacy.errors
This commit is contained in:
parent 1180304449
commit a9c6104047
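For orientation, a minimal sketch of what the new decorator enables, based on the diff below (`my_component` is a hypothetical function component):

    from spacy.language import component

    @component("my_component", assigns=["token.pos"], retokenizes=False)
    def my_component(doc):
        # Hypothetical no-op component: the decorator writes name, factory,
        # assigns, requires and retokenizes onto the function object and
        # registers a factory in Language.factories.
        return doc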
requirements.txt
@@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=7.2.0,<7.3.0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.2.0,<1.1.0
+wasabi>=0.3.0,<1.1.0
 srsly>=0.1.0,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
setup.cfg
@@ -49,7 +49,7 @@ install_requires =
     blis>=0.4.0,<0.5.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    wasabi>=0.2.0,<1.1.0
+    wasabi>=0.3.0,<1.1.0
     srsly>=0.1.0,<1.1.0
     pathlib==1.0.1; python_version < "3.4"
     importlib_metadata>=0.20; python_version < "3.8"
spacy/__init__.py
@@ -9,12 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
 # These are imported as part of the API
 from thinc.neural.util import prefer_gpu, require_gpu
 
+from . import pipeline
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__
 from .errors import Errors, Warnings, deprecation_warning
 from . import util
 from .util import register_architecture, get_architecture
+from .language import component
 
 
 if sys.maxunicode == 65535:
spacy/analysis.py (new file, 176 lines)
@@ -0,0 +1,176 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+from wasabi import Printer
+
+from .tokens import Doc, Token
+from .errors import Errors, Warnings, user_warning
+
+
+def analyze_pipes(pipeline, name, pipe, index, warn=True):
+    """Analyze a pipeline component with respect to its position in the current
+    pipeline and the other components. Will check whether requirements are
+    fulfilled (e.g. if previous components assign the attributes).
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    name (unicode): The name of the pipeline component to analyze.
+    pipe (callable): The pipeline component function to analyze.
+    index (int): The index of the component in the pipeline.
+    warn (bool): Show user warning if problem is found.
+    RETURNS (list): The problems found for the given pipeline component.
+    """
+    assert pipeline[index][0] == name
+    prev_pipes = pipeline[:index]
+    pipe_requires = getattr(pipe, "requires", [])
+    requires = OrderedDict([(annot, False) for annot in pipe_requires])
+    if requires:
+        for prev_name, prev_pipe in prev_pipes:
+            prev_assigns = getattr(prev_pipe, "assigns", [])
+            for annot in prev_assigns:
+                requires[annot] = True
+    problems = []
+    for annot, fulfilled in requires.items():
+        if not fulfilled:
+            problems.append(annot)
+            if warn:
+                user_warning(Warnings.W025.format(name=name, attr=annot))
+    return problems
+
+
+def analyze_all_pipes(pipeline, warn=True):
+    """Analyze all pipes in the pipeline in order.
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    warn (bool): Show user warning if problem is found.
+    RETURNS (dict): The problems found, keyed by component name.
+    """
+    problems = {}
+    for i, (name, pipe) in enumerate(pipeline):
+        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    return problems
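A rough usage sketch of the two helpers above (hypothetical components with manually attached declarations; the decorator normally does this for you):

    from spacy.analysis import analyze_all_pipes

    def tagger(doc):  # hypothetical
        return doc
    tagger.assigns = ["token.tag"]

    def lemmatizer(doc):  # hypothetical
        return doc
    lemmatizer.requires = ["token.pos"]

    pipeline = [("tagger", tagger), ("lemmatizer", lemmatizer)]
    problems = analyze_all_pipes(pipeline, warn=False)
    # problems == {"tagger": [], "lemmatizer": ["token.pos"]}
    # "lemmatizer" requires token.pos, but nothing before it assigns it.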
+
+
+def dot_to_dict(values):
+    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
+    become {"token": {"pos": True, "_": {"xyz": True }}}.
+
+    values (iterable): The values to convert.
+    RETURNS (dict): The converted values.
+    """
+    result = {}
+    for value in values:
+        path = result
+        parts = value.lower().split(".")
+        for i, item in enumerate(parts):
+            is_last = i == len(parts) - 1
+            path = path.setdefault(item, True if is_last else {})
+    return result
+
+
+def validate_attrs(values):
+    """Validate component attributes provided to "assigns", "requires" etc.
+    Raises error for invalid attributes and formatting. Doesn't check if
+    custom extension attributes are registered, since this is something the
+    user might want to do themselves later in the component.
+
+    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (iterable): The checked attributes.
+    """
+    data = dot_to_dict(values)
+    objs = {"doc": Doc, "token": Token}
+    for obj_key, attrs in data.items():
+        if obj_key not in objs:  # first element is not doc/token
+            if obj_key == "span":
+                span_attrs = [attr for attr in values if attr.startswith("span.")]
+                raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
+            invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key))
+            raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
+        if not isinstance(attrs, dict):  # attr is something like "doc"
+            raise ValueError(Errors.E182.format(attr=obj_key))
+        for attr, value in attrs.items():
+            if attr == "_":
+                if value is True:  # attr is something like "doc._"
+                    raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
+                for ext_attr, ext_value in value.items():
+                    # We don't check whether the attribute actually exists
+                    if ext_value is not True:  # attr is something like doc._.x.y
+                        good = "{}._.{}".format(obj_key, ext_attr)
+                        bad = "{}.{}".format(good, ".".join(ext_value))
+                        raise ValueError(Errors.E183.format(attr=bad, solution=good))
+                continue  # we can't validate those further
+            if attr.endswith("_"):  # attr is something like "token.pos_"
+                raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
+            if value is not True:  # attr is something like doc.x.y
+                good = "{}.{}".format(obj_key, attr)
+                bad = "{}.{}".format(good, ".".join(value))
+                raise ValueError(Errors.E183.format(attr=bad, solution=good))
+            obj = objs[obj_key]
+            if not hasattr(obj, attr):
+                raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
+    return values
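To illustrate the checks, assuming I'm reading them correctly:

    from spacy.analysis import validate_attrs

    validate_attrs(["token.pos", "doc._.my_attr"])  # OK, returned unchanged
    validate_attrs(["span.label"])   # raises E180: Span attributes unsupported
    validate_attrs(["token.pos_"])   # raises E184: use "token.pos" instead
    validate_attrs(["doc"])          # raises E182: no attribute given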
+
+
+def _get_feature_for_attr(pipeline, attr, feature):
+    assert feature in ["assigns", "requires"]
+    result = []
+    for pipe_name, pipe in pipeline:
+        pipe_assigns = getattr(pipe, feature, [])
+        if attr in pipe_assigns:
+            result.append((pipe_name, pipe))
+    return result
+
+
+def get_assigns_for_attr(pipeline, attr):
+    """Get all pipeline components that assign an attr, e.g. "doc.tensor".
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    attr (unicode): The attribute to check.
+    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    """
+    return _get_feature_for_attr(pipeline, attr, "assigns")
+
+
+def get_requires_for_attr(pipeline, attr):
+    """Get all pipeline components that require an attr, e.g. "doc.tensor".
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    attr (unicode): The attribute to check.
+    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    """
+    return _get_feature_for_attr(pipeline, attr, "requires")
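Continuing the hypothetical two-component pipeline from the earlier sketch, these lookup helpers invert the declarations:

    from spacy.analysis import get_assigns_for_attr, get_requires_for_attr

    get_assigns_for_attr(pipeline, "token.tag")   # [("tagger", tagger)]
    get_requires_for_attr(pipeline, "token.pos")  # [("lemmatizer", lemmatizer)]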
+
+
+def print_summary(nlp, pretty=True, no_print=False):
+    """Print a formatted summary for the current nlp object's pipeline. Shows
+    a table with the pipeline components and what they assign and require, as
+    well as any problems if available.
+
+    nlp (Language): The nlp object.
+    pretty (bool): Pretty-print the results (color etc).
+    no_print (bool): Don't print anything, just return the data.
+    RETURNS (dict): A dict with "overview" and "problems".
+    """
+    msg = Printer(pretty=pretty, no_print=no_print)
+    overview = []
+    problems = {}
+    for i, (name, pipe) in enumerate(nlp.pipeline):
+        requires = getattr(pipe, "requires", [])
+        assigns = getattr(pipe, "assigns", [])
+        retok = getattr(pipe, "retokenizes", False)
+        overview.append((i, name, requires, assigns, retok))
+        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    msg.divider("Pipeline Overview")
+    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
+    msg.table(overview, header=header, divider=True, multiline=True)
+    n_problems = sum(len(p) for p in problems.values())
+    if any(p for p in problems.values()):
+        msg.divider("Problems ({})".format(n_problems))
+        for name, problem in problems.items():
+            if problem:
+                problem = ", ".join(problem)
+                msg.warn("'{}' requirements not met: {}".format(name, problem))
+    else:
+        msg.good("No problems found.")
+    if no_print:
+        return {"overview": overview, "problems": problems}
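A hedged usage sketch for the summary (assumes a pipeline with tagger, parser and ner, e.g. a pretrained model such as en_core_web_sm):

    import spacy
    from spacy.analysis import print_summary

    nlp = spacy.load("en_core_web_sm")
    print_summary(nlp)  # prints the "Pipeline Overview" table plus problems
    data = print_summary(nlp, no_print=True)  # dict with "overview"/"problems"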
spacy/compat.py
@@ -12,6 +12,7 @@ import os
 import sys
 import itertools
 import ast
+import types
 
 from thinc.neural.util import copy_array
 
@@ -67,6 +68,7 @@ if is_python2:
    basestring_ = basestring  # noqa: F821
    input_ = raw_input  # noqa: F821
    path2str = lambda path: str(path).decode("utf8")
+    class_types = (type, types.ClassType)
 
 elif is_python3:
     bytes_ = bytes
@@ -74,6 +76,7 @@ elif is_python3:
     basestring_ = str
     input_ = input
     path2str = lambda path: str(path)
+    class_types = (type, types.ClassType) if is_python_pre_3_5 else type
 
 
 def b_to_str(b_str):
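As I read it, class_types exists so the decorator's factory (in language.py below) can tell decorated classes from decorated functions on both Python 2 and 3:

    from spacy.compat import class_types

    class MyComponent(object):  # hypothetical
        pass

    isinstance(MyComponent, class_types)      # True, also for Py2 old-style classes
    isinstance(lambda doc: doc, class_types)  # False: a plain function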
spacy/errors.py
@@ -99,6 +99,8 @@ class Warnings(object):
             "'n_process' will be set to 1.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
+    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
+            "previous components in the pipeline declare that they assign it.")
 
 
 @add_codes
@@ -511,6 +513,20 @@ class Errors(object):
     E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
             "Doc. If you only want to add one pattern, make sure to wrap it "
             "in a list. For example: matcher.add('{key}', [doc])")
+    E180 = ("Span attributes can't be declared as required or assigned by "
+            "components, since spans are only views of the Doc. Use Doc and "
+            "Token attributes only and remove the following: {attrs}")
+    E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
+            "Only Doc and Token attributes are supported.")
+    E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
+            "to define the attribute? For example: {attr}.???")
+    E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
+            "attributes are supported, for example: {solution}")
+    E184 = ("Only attributes without underscores are supported in component "
+            "attribute declarations (because underscore and non-underscore "
+            "attributes are connected anyway): {attr} -> {solution}")
+    E185 = ("Received invalid attribute in component attribute declaration: "
+            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
 
 
 @add_codes
spacy/language.py
@@ -18,13 +18,8 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .lookups import Lookups
-from .pipeline import DependencyParser, Tagger
-from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
-from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
-from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
-from .pipeline import EntityRuler
-from .pipeline import Morphologizer
-from .compat import izip, basestring_, is_python2
+from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .compat import izip, basestring_, is_python2, class_types
 from .gold import GoldParse
 from .scorer import Scorer
 from ._ml import link_vectors_to_models, create_default_optimizer
@@ -40,6 +35,9 @@ from . import util
 from . import about
 
 
+ENABLE_PIPELINE_ANALYSIS = False
+
+
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
@@ -135,19 +133,6 @@ class Language(object):
 
     factories = {
         "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
-        "tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
-        "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
-        "morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
-        "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
-        "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
-        "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
-        "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
-        "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
-        "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
-        "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
-        "merge_entities": lambda nlp, **cfg: merge_entities,
-        "merge_subtokens": lambda nlp, **cfg: merge_subtokens,
-        "entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg),
     }
 
     def __init__(
@@ -218,6 +203,7 @@ class Language(object):
             "name": self.vocab.vectors.name,
         }
         self._meta["pipeline"] = self.pipe_names
+        self._meta["factories"] = self.pipe_factories
         self._meta["labels"] = self.pipe_labels
         return self._meta
 
@@ -259,6 +245,17 @@ class Language(object):
         """
         return [pipe_name for pipe_name, _ in self.pipeline]
 
+    @property
+    def pipe_factories(self):
+        """Get the component factories for the available pipeline components.
+
+        RETURNS (dict): Factory names, keyed by component names.
+        """
+        factories = {}
+        for pipe_name, pipe in self.pipeline:
+            factories[pipe_name] = getattr(pipe, "factory", pipe_name)
+        return factories
+
     @property
     def pipe_labels(self):
         """Get the labels set by the pipeline components, if available (if
@@ -327,33 +324,30 @@ class Language(object):
                 msg += Errors.E004.format(component=component)
             raise ValueError(msg)
         if name is None:
-            if hasattr(component, "name"):
-                name = component.name
-            elif hasattr(component, "__name__"):
-                name = component.__name__
-            elif hasattr(component, "__class__") and hasattr(
-                component.__class__, "__name__"
-            ):
-                name = component.__class__.__name__
-            else:
-                name = repr(component)
+            name = util.get_component_name(component)
         if name in self.pipe_names:
             raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
             raise ValueError(Errors.E006)
+        pipe_index = 0
         pipe = (name, component)
         if last or not any([first, before, after]):
+            pipe_index = len(self.pipeline)
             self.pipeline.append(pipe)
         elif first:
            self.pipeline.insert(0, pipe)
         elif before and before in self.pipe_names:
+            pipe_index = self.pipe_names.index(before)
             self.pipeline.insert(self.pipe_names.index(before), pipe)
         elif after and after in self.pipe_names:
+            pipe_index = self.pipe_names.index(after) + 1
             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
         else:
             raise ValueError(
                 Errors.E001.format(name=before or after, opts=self.pipe_names)
             )
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_pipes(self.pipeline, name, component, pipe_index)
 
     def has_pipe(self, name):
         """Check if a component name is present in the pipeline. Equivalent to
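Since the analysis is gated behind a module-level flag, a sketch of opting in (hypothetical component):

    import spacy
    from spacy import language
    from spacy.language import component

    language.ENABLE_PIPELINE_ANALYSIS = True  # off by default in this commit

    @component("needs_pos", requires=["token.pos"])
    def needs_pos(doc):  # hypothetical
        return doc

    nlp = spacy.blank("en")
    nlp.add_pipe(needs_pos)  # warns W025: nothing before it assigns token.pos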
@@ -382,6 +376,8 @@ class Language(object):
                 msg += Errors.E135.format(name=name)
             raise ValueError(msg)
         self.pipeline[self.pipe_names.index(name)] = (name, component)
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_all_pipes(self.pipeline)
 
     def rename_pipe(self, old_name, new_name):
         """Rename a pipeline component.
@@ -408,6 +404,8 @@ class Language(object):
         """
         if name not in self.pipe_names:
             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_all_pipes(self.pipeline)
         return self.pipeline.pop(self.pipe_names.index(name))
 
     def __call__(self, text, disable=[], component_cfg=None):
@@ -1001,6 +999,52 @@ class Language(object):
         return self
 
 
+class component(object):
+    """Decorator for pipeline components. Can decorate both function components
+    and class components and will automatically register components in the
+    Language.factories. If the component is a class and needs access to the
+    nlp object or config parameters, it can expose a from_nlp classmethod
+    that takes the nlp object and **cfg arguments and returns the initialized
+    component.
+    """
+
+    # NB: This decorator needs to live here, because it needs to write to
+    # Language.factories. All other solutions would cause circular import.
+
+    def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
+        """Decorate a pipeline component.
+
+        name (unicode): Default component and factory name.
+        assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
+        requires (list): Attributes required by component, e.g. `["token.dep"]`.
+        retokenizes (bool): Whether the component changes the tokenization.
+        """
+        self.name = name
+        self.assigns = validate_attrs(assigns)
+        self.requires = validate_attrs(requires)
+        self.retokenizes = retokenizes
+
+    def __call__(self, *args, **kwargs):
+        obj = args[0]
+        args = args[1:]
+        factory_name = self.name or util.get_component_name(obj)
+        obj.name = factory_name
+        obj.factory = factory_name
+        obj.assigns = self.assigns
+        obj.requires = self.requires
+        obj.retokenizes = self.retokenizes
+
+        def factory(nlp, **cfg):
+            if hasattr(obj, "from_nlp"):
+                return obj.from_nlp(nlp, **cfg)
+            elif isinstance(obj, class_types):
+                return obj()
+            return obj
+
+        Language.factories[obj.factory] = factory
+        return obj
+
+
 def _fix_pretrained_vectors_name(nlp):
     # TODO: Replace this once we handle vectors consistently as static
     # data
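To illustrate the class path through the decorator (hypothetical component; from_nlp is the hook described in the docstring above):

    from spacy.language import component

    @component("my_ruler", assigns=["doc.ents"])
    class MyRuler(object):  # hypothetical class component
        @classmethod
        def from_nlp(cls, nlp, **cfg):
            return cls(nlp, **cfg)

        def __init__(self, nlp, **cfg):
            self.cfg = cfg

        def __call__(self, doc):
            return doc

    # The decorator registered a factory under "my_ruler", so the
    # component can be created by name:
    # nlp.add_pipe(nlp.create_pipe("my_ruler"))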
spacy/pipeline/entityruler.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from collections import defaultdict, OrderedDict
 import srsly
 
+from ..language import component
 from ..errors import Errors
 from ..compat import basestring_
 from ..util import ensure_path, to_disk, from_disk
@@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher
 DEFAULT_ENT_ID_SEP = "||"
 
 
+@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"])
 class EntityRuler(object):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
@@ -24,8 +26,6 @@ class EntityRuler(object):
     USAGE: https://spacy.io/usage/rule-based-matching#entityruler
     """
 
-    name = "entity_ruler"
-
     def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
         """Initialize the entity ruler. If patterns are supplied here, they
         need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -69,6 +69,10 @@ class EntityRuler(object):
         if patterns is not None:
             self.add_patterns(patterns)
 
+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(nlp, **cfg)
+
     def __len__(self):
         """The number of all patterns added to the entity ruler."""
         n_token_patterns = sum(len(p) for p in self.token_patterns.values())
spacy/pipeline/functions.py
@@ -1,9 +1,15 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+from ..language import component
 from ..matcher import Matcher
 
 
+@component(
+    "merge_noun_chunks",
+    requires=["token.dep", "token.tag", "token.pos"],
+    retokenizes=True,
+)
 def merge_noun_chunks(doc):
     """Merge noun chunks into a single token.
 
@@ -21,6 +27,11 @@ def merge_noun_chunks(doc):
     return doc
 
 
+@component(
+    "merge_entities",
+    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+    retokenizes=True,
+)
 def merge_entities(doc):
     """Merge entities into a single token.
 
@@ -36,6 +47,7 @@ def merge_entities(doc):
     return doc
 
 
+@component("merge_subtokens", requires=["token.dep"], retokenizes=True)
 def merge_subtokens(doc, label="subtok"):
     """Merge subtokens into a single token.
 
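If I read the decorator right, the built-ins now carry their declarations as plain attributes:

    from spacy.pipeline import merge_entities

    merge_entities.name         # "merge_entities"
    merge_entities.requires     # ["doc.ents", "token.ent_iob", "token.ent_type"]
    merge_entities.retokenizes  # True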
spacy/pipeline/hooks.py
@@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 
 from .pipes import Pipe
+from ..language import component
 from .._ml import link_vectors_to_models
 
 
+@component("sentencizer_hook", assigns=["doc.user_hooks"])
 class SentenceSegmenter(object):
     """A simple spaCy hook, to allow custom sentence boundary detection logic
     (that doesn't require the dependency parse). To change the sentence
@@ -17,8 +19,6 @@ class SentenceSegmenter(object):
     and yield `Span` objects for each sentence.
     """
 
-    name = "sentencizer"
-
     def __init__(self, vocab, strategy=None):
         self.vocab = vocab
         if strategy is None or strategy == "on_punct":
@@ -44,6 +44,7 @@ class SentenceSegmenter(object):
             yield doc[start : len(doc)]
 
 
+@component("similarity", assigns=["doc.user_hooks"])
 class SimilarityHook(Pipe):
     """
     Experimental: A pipeline component to install a hook for supervised
@@ -58,8 +59,6 @@ class SimilarityHook(Pipe):
     Where W is a vector of dimension weights, initialized to 1.
     """
 
-    name = "similarity"
-
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
spacy/pipeline/morphologizer.pyx
@@ -8,6 +8,7 @@ from thinc.api import chain
 from thinc.neural.util import to_categorical, copy_array, get_array_module
 from .. import util
 from .pipes import Pipe
+from ..language import component
 from .._ml import Tok2Vec, build_morphologizer_model
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import create_default_optimizer
@@ -18,8 +19,8 @@ from ..vocab cimport Vocab
 from ..morphology cimport Morphology
 
 
+@component("morphologizer", assigns=["token.morph", "token.pos"])
 class Morphologizer(Pipe):
-    name = 'morphologizer'
 
     @classmethod
     def Model(cls, **cfg):
| 
						 | 
					@ -13,7 +13,6 @@ from thinc.misc import LayerNorm
 | 
				
			||||||
from thinc.neural.util import to_categorical
 | 
					from thinc.neural.util import to_categorical
 | 
				
			||||||
from thinc.neural.util import get_array_module
 | 
					from thinc.neural.util import get_array_module
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .functions import merge_subtokens
 | 
					 | 
				
			||||||
from ..tokens.doc cimport Doc
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
from ..syntax.nn_parser cimport Parser
 | 
					from ..syntax.nn_parser cimport Parser
 | 
				
			||||||
from ..syntax.ner cimport BiluoPushDown
 | 
					from ..syntax.ner cimport BiluoPushDown
 | 
				
			||||||
| 
						 | 
					@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager
 | 
				
			||||||
from ..morphology cimport Morphology
 | 
					from ..morphology cimport Morphology
 | 
				
			||||||
from ..vocab cimport Vocab
 | 
					from ..vocab cimport Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .functions import merge_subtokens
 | 
				
			||||||
 | 
					from ..language import Language, component
 | 
				
			||||||
from ..syntax import nonproj
 | 
					from ..syntax import nonproj
 | 
				
			||||||
from ..attrs import POS, ID
 | 
					from ..attrs import POS, ID
 | 
				
			||||||
from ..parts_of_speech import X
 | 
					from ..parts_of_speech import X
 | 
				
			||||||
| 
						 | 
					@ -54,6 +55,10 @@ class Pipe(object):
 | 
				
			||||||
        """Initialize a model for the pipe."""
 | 
					        """Initialize a model for the pipe."""
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @classmethod
 | 
				
			||||||
 | 
					    def from_nlp(cls, nlp, **cfg):
 | 
				
			||||||
 | 
					        return cls(nlp.vocab, **cfg)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, vocab, model=True, **cfg):
 | 
					    def __init__(self, vocab, model=True, **cfg):
 | 
				
			||||||
        """Create a new pipe instance."""
 | 
					        """Create a new pipe instance."""
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
| 
						 | 
					@ -223,11 +228,10 @@ class Pipe(object):
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component("tensorizer", assigns=["doc.tensor"])
 | 
				
			||||||
class Tensorizer(Pipe):
 | 
					class Tensorizer(Pipe):
 | 
				
			||||||
    """Pre-train position-sensitive vectors for tokens."""
 | 
					    """Pre-train position-sensitive vectors for tokens."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name = "tensorizer"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    @classmethod
 | 
					    @classmethod
 | 
				
			||||||
    def Model(cls, output_size=300, **cfg):
 | 
					    def Model(cls, output_size=300, **cfg):
 | 
				
			||||||
        """Create a new statistical model for the class.
 | 
					        """Create a new statistical model for the class.
 | 
				
			||||||
| 
						 | 
					@ -362,14 +366,13 @@ class Tensorizer(Pipe):
 | 
				
			||||||
        return sgd
 | 
					        return sgd
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component("tagger", assigns=["token.tag", "token.pos"])
 | 
				
			||||||
class Tagger(Pipe):
 | 
					class Tagger(Pipe):
 | 
				
			||||||
    """Pipeline component for part-of-speech tagging.
 | 
					    """Pipeline component for part-of-speech tagging.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/tagger
 | 
					    DOCS: https://spacy.io/api/tagger
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name = "tagger"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, vocab, model=True, **cfg):
 | 
					    def __init__(self, vocab, model=True, **cfg):
 | 
				
			||||||
        self.vocab = vocab
 | 
					        self.vocab = vocab
 | 
				
			||||||
        self.model = model
 | 
					        self.model = model
 | 
				
			||||||
| 
						 | 
					@ -657,13 +660,12 @@ class Tagger(Pipe):
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component("nn_labeller")
 | 
				
			||||||
class MultitaskObjective(Tagger):
 | 
					class MultitaskObjective(Tagger):
 | 
				
			||||||
    """Experimental: Assist training of a parser or tagger, by training a
 | 
					    """Experimental: Assist training of a parser or tagger, by training a
 | 
				
			||||||
    side-objective.
 | 
					    side-objective.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name = "nn_labeller"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
 | 
					    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
 | 
				
			||||||
        self.vocab = vocab
 | 
					        self.vocab = vocab
 | 
				
			||||||
        self.model = model
 | 
					        self.model = model
 | 
				
			||||||
| 
						 | 
					@ -898,12 +900,12 @@ class ClozeMultitask(Pipe):
 | 
				
			||||||
            losses[self.name] += loss
 | 
					            losses[self.name] += loss
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component("textcat", assigns=["doc.cats"])
 | 
				
			||||||
class TextCategorizer(Pipe):
 | 
					class TextCategorizer(Pipe):
 | 
				
			||||||
    """Pipeline component for text classification.
 | 
					    """Pipeline component for text classification.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/textcategorizer
 | 
					    DOCS: https://spacy.io/api/textcategorizer
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    name = 'textcat'
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @classmethod
 | 
					    @classmethod
 | 
				
			||||||
    def Model(cls, nr_class=1, **cfg):
 | 
					    def Model(cls, nr_class=1, **cfg):
 | 
				
			||||||
| 
						 | 
					@ -1051,8 +1053,11 @@ cdef class DependencyParser(Parser):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/dependencyparser
 | 
					    DOCS: https://spacy.io/api/dependencyparser
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    # cdef classes can't have decorators, so we're defining this here
 | 
				
			||||||
    name = "parser"
 | 
					    name = "parser"
 | 
				
			||||||
 | 
					    factory = "parser"
 | 
				
			||||||
 | 
					    assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
 | 
				
			||||||
 | 
					    requires = []
 | 
				
			||||||
    TransitionSystem = ArcEager
 | 
					    TransitionSystem = ArcEager
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
| 
						 | 
					@ -1097,8 +1102,10 @@ cdef class EntityRecognizer(Parser):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/entityrecognizer
 | 
					    DOCS: https://spacy.io/api/entityrecognizer
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					 | 
				
			||||||
    name = "ner"
 | 
					    name = "ner"
 | 
				
			||||||
 | 
					    factory = "ner"
 | 
				
			||||||
 | 
					    assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
 | 
				
			||||||
 | 
					    requires = []
 | 
				
			||||||
    TransitionSystem = BiluoPushDown
 | 
					    TransitionSystem = BiluoPushDown
 | 
				
			||||||
    nr_feature = 6
 | 
					    nr_feature = 6
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1129,12 +1136,16 @@ cdef class EntityRecognizer(Parser):
 | 
				
			||||||
        return tuple(sorted(labels))
 | 
					        return tuple(sorted(labels))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component(
 | 
				
			||||||
 | 
					    "entity_linker",
 | 
				
			||||||
 | 
					    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
 | 
				
			||||||
 | 
					    assigns=["token.ent_kb_id"]
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
class EntityLinker(Pipe):
 | 
					class EntityLinker(Pipe):
 | 
				
			||||||
    """Pipeline component for named entity linking.
 | 
					    """Pipeline component for named entity linking.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/entitylinker
 | 
					    DOCS: https://spacy.io/api/entitylinker
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    name = 'entity_linker'
 | 
					 | 
				
			||||||
    NIL = "NIL"  # string used to refer to a non-existing link
 | 
					    NIL = "NIL"  # string used to refer to a non-existing link
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @classmethod
 | 
					    @classmethod
 | 
				
			||||||
| 
						 | 
@@ -1405,13 +1416,13 @@ class EntityLinker(Pipe):
         raise NotImplementedError
 
 
+@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
 class Sentencizer(object):
     """Segment the Doc into sentences using a rule-based strategy.
 
     DOCS: https://spacy.io/api/sentencizer
     """
 
-    name = "sentencizer"
     default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
             '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
             '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
@@ -1437,6 +1448,10 @@ class Sentencizer(object):
         else:
             self.punct_chars = set(self.default_punct_chars)
 
+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(**cfg)
+
     def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.
 
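from_nlp is the classmethod the factories call to build a component with access to the nlp object and any config; the stateless Sentencizer just forwards the config to its constructor. Roughly, assuming nlp is an existing Language instance and that punct_chars is the constructor's keyword argument:

    # what a factory does under the hood:
    #     component = cls.from_nlp(nlp, **cfg)
    sentencizer = Sentencizer.from_nlp(nlp, punct_chars=["!", ".", "?"])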
@@ -1503,4 +1518,9 @@ class Sentencizer(object):
         return self
 
 
+# Cython classes can't be decorated, so we need to add the factories here
+Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
+Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
+
+
 __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]
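With these entries registered, the Cython-implemented pipes resolve through the same factory lookup as decorated components:

    from spacy.language import Language

    nlp = Language()
    parser = nlp.create_pipe("parser")  # -> DependencyParser.from_nlp(nlp)
    nlp.add_pipe(parser)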
@@ -128,6 +128,10 @@ cdef class Parser:
         self._multitasks = []
         self._rehearsal_model = None
 
+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(nlp.vocab, **cfg)
+
     def __reduce__(self):
         return (Parser, (self.vocab, self.moves, self.model), None, None)
 
spacy/tests/pipeline/test_analysis.py (new file)
@@ -0,0 +1,146 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy.language
+from spacy.language import Language, component
+from spacy.analysis import print_summary, validate_attrs
+from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.compat import is_python2
+from mock import Mock, ANY
+import pytest
+
+
+def test_component_decorator_function():
+    @component(name="test")
+    def test_component(doc):
+        """docstring"""
+        return doc
+
+    assert test_component.name == "test"
+    if not is_python2:
+        assert test_component.__doc__ == "docstring"
+    assert test_component("foo") == "foo"
+
+
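This test passes because the decorator writes its metadata straight onto the object instead of returning a wrapper, so the docstring and call behaviour are untouched. A simplified sketch of that mechanism (not the actual implementation):

    def component(name=None, assigns=tuple(), requires=tuple()):
        def decorator(obj):
            # write metadata onto the function or class itself -- no wrapper
            obj.name = name or obj.__name__
            obj.factory = obj.name
            obj.assigns = list(assigns)
            obj.requires = list(requires)
            return obj
        return decorator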
+def test_component_decorator_class():
+    @component(name="test")
+    class TestComponent(object):
+        """docstring1"""
+
+        foo = "bar"
+
+        def __call__(self, doc):
+            """docstring2"""
+            return doc
+
+        def custom(self, x):
+            """docstring3"""
+            return x
+
+    assert TestComponent.name == "test"
+    assert TestComponent.foo == "bar"
+    assert hasattr(TestComponent, "custom")
+    test_component = TestComponent()
+    assert test_component.foo == "bar"
+    assert test_component("foo") == "foo"
+    assert hasattr(test_component, "custom")
+    assert test_component.custom("bar") == "bar"
+    if not is_python2:
+        assert TestComponent.__doc__ == "docstring1"
+        assert TestComponent.__call__.__doc__ == "docstring2"
+        assert TestComponent.custom.__doc__ == "docstring3"
+        assert test_component.__doc__ == "docstring1"
+        assert test_component.__call__.__doc__ == "docstring2"
+        assert test_component.custom.__doc__ == "docstring3"
+
+
+def test_component_decorator_assigns():
+    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
+
+    @component("c1", assigns=["token.tag", "doc.tensor"])
+    def test_component1(doc):
+        return doc
+
+    @component(
+        "c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"]
+    )
+    def test_component2(doc):
+        return doc
+
+    @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
+    def test_component3(doc):
+        return doc
+
+    assert "c1" in Language.factories
+    assert "c2" in Language.factories
+    assert "c3" in Language.factories
+
+    nlp = Language()
+    nlp.add_pipe(test_component1)
+    with pytest.warns(UserWarning):
+        nlp.add_pipe(test_component2)
+    nlp.add_pipe(test_component3)
+    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
+    assert [name for name, _ in assigns_tensor] == ["c1", "c2"]
+    test_component4 = nlp.create_pipe("c1")
+    assert test_component4.name == "c1"
+    assert test_component4.factory == "c1"
+    nlp.add_pipe(test_component4, name="c4")
+    assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
+    assert "c4" not in Language.factories
+    assert nlp.pipe_factories["c1"] == "c1"
+    assert nlp.pipe_factories["c4"] == "c1"
+    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
+    assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
+    requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
+    assert [name for name, _ in requires_pos] == ["c2"]
+    assert print_summary(nlp, no_print=True)
+    assert nlp("hello world")
+
+
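As the assertions show, get_assigns_for_attr and get_requires_for_attr return (name, component) pairs in pipeline order:

    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    for name, pipe in assigns_tensor:
        print(name)  # "c1", "c2", "c4" -- every pipe that sets doc.tensor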
+def test_component_factories_from_nlp():
+    """Test that class components can implement a from_nlp classmethod that
+    gives them access to the nlp object and config via the factory."""
+
+    class TestComponent5(object):
+        def __call__(self, doc):
+            return doc
+
+    mock = Mock()
+    mock.return_value = TestComponent5()
+    TestComponent5.from_nlp = classmethod(mock)
+    TestComponent5 = component("c5")(TestComponent5)
+
+    assert "c5" in Language.factories
+    nlp = Language()
+    pipe = nlp.create_pipe("c5", config={"foo": "bar"})
+    nlp.add_pipe(pipe)
+    assert nlp("hello world")
+    # The first argument here is the class itself, so we're accepting any here
+    mock.assert_called_once_with(ANY, nlp, foo="bar")
+
+
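To spell out the final assertion: because from_nlp is installed via classmethod(mock), the class itself arrives as the mock's first positional argument, which ANY absorbs:

    # the factory effectively runs:
    #     TestComponent5.from_nlp(nlp, foo="bar")
    # which the classmethod wrapper turns into:
    #     mock(TestComponent5, nlp, foo="bar")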
+def test_analysis_validate_attrs_valid():
+    attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz"]
+    assert validate_attrs(attrs)
+    for attr in attrs:
+        assert validate_attrs([attr])
+    with pytest.raises(ValueError):
+        validate_attrs(["doc.sents", "doc.xyz"])
+
+
+@pytest.mark.parametrize(
+    "attr",
+    [
+        "doc",
+        "doc_ents",
+        "doc.xyz",
+        "token.xyz",
+        "token.tag_",
+        "token.tag.xyz",
+        "token._.xyz.abc",
+    ],
+)
+def test_analysis_validate_attrs_invalid(attr):
+    with pytest.raises(ValueError):
+        validate_attrs([attr])
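Taken together, the two tests pin down the accepted attribute format: a known doc.* or token.* attribute, or a custom extension under ._., with no underscore-suffixed variants or nested paths. For instance:

    validate_attrs(["doc.ents", "token._.my_ext"])  # ok -- returns truthy
    validate_attrs(["token.tag_"])                  # raises ValueError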
@@ -247,6 +247,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     cls = get_lang_class(lang)
     nlp = cls(meta=meta, **overrides)
     pipeline = meta.get("pipeline", [])
+    factories = meta.get("factories", {})
     disable = overrides.get("disable", [])
     if pipeline is True:
         pipeline = nlp.Defaults.pipe_names
@@ -255,7 +256,8 @@ def load_model_from_path(model_path, meta=False, **overrides):
     for name in pipeline:
         if name not in disable:
             config = meta.get("pipeline_args", {}).get(name, {})
-            component = nlp.create_pipe(name, config=config)
+            factory = factories.get(name, name)
+            component = nlp.create_pipe(factory, config=config)
             nlp.add_pipe(component, name=name)
     return nlp.from_disk(model_path)
 
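This is the loading half of the meta factories: a model's meta.json can map a pipe's name to the factory used to create it, so renamed pipes still resolve. A hypothetical meta, shown as a Python dict:

    meta = {
        "lang": "en",
        "pipeline": ["my_ner"],           # name of the pipe in the model
        "factories": {"my_ner": "ner"},   # pipe name -> factory to build it
        "pipeline_args": {"my_ner": {}},  # per-pipe config
    }
    # per the hunk above, loading then does:
    #     factory = factories.get(name, name)   # falls back to the name
    #     component = nlp.create_pipe(factory, config=config)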
@@ -368,6 +370,16 @@ def is_in_jupyter():
     return False
 
 
+def get_component_name(component):
+    if hasattr(component, "name"):
+        return component.name
+    if hasattr(component, "__name__"):
+        return component.__name__
+    if hasattr(component, "__class__") and hasattr(component.__class__, "__name__"):
+        return component.__class__.__name__
+    return repr(component)
+
+
 def get_cuda_stream(require=False):
     if CudaStream is None:
         return None
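get_component_name gives the analysis output a printable label for any pipe, trying the "name" attribute set by @component first, then __name__, then the class name, before falling back to repr:

    from spacy.util import get_component_name

    get_component_name(lambda doc: doc)  # "<lambda>" via __name__
    get_component_name(object())         # "object" via the class name
    get_component_name(42)               # "int" via the class name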