mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 09:57:26 +03:00

Merge
commit 75a1569908
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a26"
+__version__ = "3.0.0a28"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -7,6 +7,7 @@ import srsly
 
 from .. import util
 from ..training.initialize import init_nlp, convert_vectors
+from ..language import Language
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
 
@@ -19,9 +20,9 @@ def init_vectors_cli(
     output_dir: Path = Arg(..., help="Pipeline output directory"),
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that
@@ -32,12 +33,7 @@ def init_vectors_cli(
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
+        update_lexemes(nlp, jsonl_loc)
     convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
@@ -48,6 +44,16 @@ def init_vectors_cli(
     )
 
 
+def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
+    # Mostly used for backwards-compatibility and may be removed in the future
+    lex_attrs = srsly.read_jsonl(jsonl_loc)
+    for attrs in lex_attrs:
+        if "settings" in attrs:
+            continue
+        lexeme = nlp.vocab[attrs["orth"]]
+        lexeme.set_attrs(**attrs)
+
+
 @init_cli.command(
     "nlp",
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
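A minimal sketch of the JSONL attributes file this helper consumes (file name hypothetical); it mirrors the loop above:

    # Each line of lexemes.jsonl is a dict of lexeme attributes, e.g.
    #   {"orth": "apple", "norm": "apple"}
    import srsly
    import spacy

    nlp = spacy.blank("en")
    for attrs in srsly.read_jsonl("lexemes.jsonl"):
        if "settings" in attrs:  # config rows are skipped
            continue
        nlp.vocab[attrs["orth"]].set_attrs(**attrs)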
@@ -89,7 +95,7 @@ def init_labels_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    """Generate a JSON file for labels in the data. This helps speed up the
+    """Generate JSON files for the labels in the data. This helps speed up the
     training process, since spaCy won't have to preprocess the data to
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
@@ -2,7 +2,6 @@
 train = null
 dev = null
 vectors = null
-vocab_data = null
 init_tok2vec = null
 
 [system]
@@ -11,8 +10,13 @@ gpu_allocator = null
 
 [nlp]
 lang = null
+# List of pipeline component names, in order. The names should correspond to
+# components defined in the [components block]
 pipeline = []
+# Components that are loaded but disabled by default
 disabled = []
+# Optional callbacks to modify the nlp object before it's initialized, after
+# it's created and after the pipeline has been set up
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -20,6 +24,7 @@ after_pipeline_creation = null
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
 
+# The pipeline components and their models
 [components]
 
 # Readers for corpora like dev and train.
@@ -38,8 +43,7 @@ max_length = 0
 limit = 0
 # Apply some simply data augmentation, where we replace tokens with variations.
 # This is especially useful for punctuation and case replacement, to help
-# generalize beyond corpora that don't have smart-quotes, or only have smart
-# quotes, etc.
+# generalize beyond corpora that don't/only have smart quotes etc.
 augmenter = null
 
 [corpora.dev]
@@ -53,6 +57,7 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Optional callback for data augmentation
 augmenter = null
 
 # Training hyper-parameters and additional features.
@@ -102,17 +107,18 @@ use_averages = false
 eps = 1e-8
 learn_rate = 0.001
 
-# The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own arguments via their .initialize
-# methods that are populated by the config. This lets them gather resources like
-# lookup tables and build label sets, construct vocabularies, etc.
+# These settings are used when nlp.initialize() is called (typically before
+# training or pretraining). Components and the tokenizer can each define their
+# own arguments via their initialize methods that are populated by the config.
+# This lets them gather data resources, build label sets etc.
 [initialize]
-vocab_data = ${paths.vocab_data}
-lookups = null
 vectors = ${paths.vectors}
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
+# Data and lookups for vocabulary
+vocab_data = null
+lookups = null
 # Arguments passed to the tokenizer's initialize method
 tokenizer = {}
-# Arguments passed to the initialize methods of the components (keyed by component name)
+# Arguments for initialize methods of the components (keyed by component)
 components = {}
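A minimal sketch of how this block is consumed (blank pipeline, so no labels are required):

    import spacy

    nlp = spacy.blank("en")
    # nlp.initialize() resolves the [initialize] settings from nlp.config
    # (vectors, vocab_data, lookups, per-component arguments) before training.
    optimizer = nlp.initialize()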
@@ -710,6 +710,9 @@ class Errors:
              "options: {modes}")
     E1012 = ("Entity spans and blocked/missing/outside spans should be "
              "provided to doc.set_ents as lists of `Span` objects.")
+    E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
+             "token itself. To set the morph from this MorphAnalysis, set from "
+             "the string value with: `token.set_morph(str(other_morph))`.")
 
 
 @add_codes
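A short sketch of the case E1013 guards against: a MorphAnalysis can only be assigned to a token from the same vocab, so copying across vocabs goes through the string form:

    import spacy

    doc_en = spacy.blank("en")("a")
    doc_de = spacy.blank("de")("b")
    doc_en[0].set_morph("Case=Nom")
    # doc_de[0].morph = doc_en[0].morph would raise ValueError (E1013);
    # round-tripping through the string works across vocabs:
    doc_de[0].set_morph(str(doc_en[0].morph))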
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class DanishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class GermanDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class GreekDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -4,21 +4,9 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class IndonesianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class LuxembourgishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
@@ -3,21 +3,9 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class PortugueseDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
@@ -7,21 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class RussianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -2,21 +2,9 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class SerbianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -1,21 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class TamilDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
@@ -10,13 +10,6 @@ DEFAULT_CONFIG = """
 
 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
-
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
 """
 
 
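All of the DEFAULT_CONFIG blocks removed above pulled lexeme_norm tables in via spacy.LookupsDataLoader.v1. A hedged sketch of loading the same tables explicitly, assuming spacy-lookups-data is installed and that load_lookups (the helper behind that loader) is available:

    import spacy
    from spacy.lookups import load_lookups  # assumed helper

    nlp = spacy.blank("da")
    lookups = load_lookups(nlp.lang, ["lexeme_norm"])
    nlp.vocab.lookups.add_table("lexeme_norm", lookups.get_table("lexeme_norm"))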
spacy/ml/featureextractor.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+from typing import List, Union, Callable, Tuple
+from thinc.types import Ints2d, Doc
+from thinc.api import Model, registry
+
+
+
+@registry.layers("spacy.FeatureExtractor.v1")
+def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
+    return Model("extract_features", forward, attrs={"columns": columns})
+
+
+def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
+    columns = model.attrs["columns"]
+    features: List[Ints2d] = []
+    for doc in docs:
+        if hasattr(doc, "to_array"):
+            attrs = doc.to_array(columns)
+        else:
+            attrs = doc.doc.to_array(columns)[doc.start : doc.end]
+        if attrs.ndim == 1:
+            attrs = attrs.reshape((attrs.shape[0], 1))
+        features.append(model.ops.asarray2i(attrs, dtype="uint64"))
+
+    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
+    return features, backprop
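A minimal usage sketch of the new layer (attribute choice arbitrary): it maps a batch of Docs to one uint64 array per doc, with one row per token and one column per attribute:

    import spacy
    from spacy.ml.featureextractor import FeatureExtractor
    from spacy.attrs import ORTH, SHAPE

    nlp = spacy.blank("en")
    extractor = FeatureExtractor([ORTH, SHAPE])
    docs = [nlp("green eggs"), nlp("and ham")]
    features, backprop = extractor(docs, is_train=False)
    # features[0].shape == (2, 2); backprop returns [] since there is no gradient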
@@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
 from thinc.api import HashEmbed, with_array, with_cpu, uniqued
-from thinc.api import Relu, residual, expand_window, FeatureExtractor
+from thinc.api import Relu, residual, expand_window
 
 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 
 
 @registry.architectures.register("spacy.TextCatCNN.v1")
@@ -1,16 +1,16 @@
 from typing import Optional, List, Union
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list
-from thinc.api import FeatureExtractor, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from thinc.types import Floats2d
+from thinc.api import chain, clone, concatenate, with_array, with_padded
+from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
+from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 
 from ...tokens import Doc
 from ...util import registry
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -98,7 +98,7 @@ def MultiHashEmbed(
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
 
-    The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
+    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
     varying definitions depending on the Vocab of the Doc object passed in.
     Vectors from pretrained static vectors can also be incorporated into the
     concatenated representation.
@@ -115,7 +115,7 @@ def MultiHashEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
     seed = 7
 
     def make_hash_embed(feature):
@@ -123,7 +123,7 @@ def MultiHashEmbed(
         seed += 1
         return HashEmbed(
             width,
-            rows if feature == NORM else rows // 2,
+            rows if feature == LOWER else rows // 2,
             column=cols.index(feature),
             seed=seed,
             dropout=0.0,
@@ -131,13 +131,13 @@ def MultiHashEmbed(
 
     if also_embed_subwords:
         embeddings = [
-            make_hash_embed(NORM),
+            make_hash_embed(LOWER),
             make_hash_embed(PREFIX),
             make_hash_embed(SUFFIX),
             make_hash_embed(SHAPE),
         ]
     else:
-        embeddings = [make_hash_embed(NORM)]
+        embeddings = [make_hash_embed(LOWER)]
     concat_size = width * (len(embeddings) + also_use_static_vectors)
     if also_use_static_vectors:
         model = chain(
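A hedged construction sketch (parameter values arbitrary, module path assumed) matching the signature visible in this hunk; the LOWER table gets the full row count and the subword features half of it:

    from spacy.ml.models.tok2vec import MultiHashEmbed  # path assumed

    embed = MultiHashEmbed(
        width=96,
        rows=2000,                      # LOWER rows; PREFIX/SUFFIX/SHAPE get rows // 2
        also_embed_subwords=True,       # add PREFIX, SUFFIX and SHAPE tables
        also_use_static_vectors=False,  # no pretrained vectors table required
    )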
@@ -180,13 +180,17 @@ def CharacterEmbed(
     of being in an arbitrary position depending on the word length.
 
     The characters are embedded in a embedding table with a given number of rows,
-    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
+    and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.
 
     feature (int or str): An attribute to embed, to concatenate with the characters.
     width (int): The width of the output vector and the feature embedding.
+<<<<<<< HEAD
     rows (int): The number of rows in the NORM hash embedding table.
+=======
+    rows (int): The number of rows in the LOWER hash embedding table.
+>>>>>>> 300e5a9928fd226dfddbf7d5c22558f696bfa1af
     nM (int): The dimensionality of the character embeddings. Recommended values
         are between 16 and 64.
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
@@ -149,7 +149,7 @@ class Morphologizer(Tagger):
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                 # create and add the combined morph+POS label
                 morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
@@ -167,7 +167,7 @@ class Morphologizer(Tagger):
             gold_array = []
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                 morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
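A minimal sketch of the API migration applied here and throughout the tests below: the writable Token.morph_ string property gives way to set_morph() for writing and str(token.morph) for reading:

    import spacy

    doc = spacy.blank("en")("I has")
    doc[1].set_morph("Number=Sing|Person=Three")             # was: doc[1].morph_ = "..."
    assert str(doc[1].morph) == "Number=Sing|Person=Three"   # was: doc[1].morph_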
@@ -268,6 +268,9 @@ class Tagger(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects..
         nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.
 
         DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
@@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
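A hedged sketch of what the new labels argument enables: supplying labels up front (typically the output of init labels, or the [initialize.components.*.labels] config block) lets initialize() skip extracting them from the data:

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    tagger = nlp.add_pipe("tagger")
    examples = [Example.from_dict(nlp.make_doc("I sleep"), {"tags": ["PRP", "VBP"]})]
    tagger.initialize(lambda: examples, nlp=nlp, labels=["PRP", "VBP"])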
@@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
     words = ["Eat", "blue", "ham"]
     morph = ["Feat=V", "Feat=J", "Feat=N"]
     doc = Doc(en_vocab, words=words, morphs=morph)
-    assert morph[0] == doc[0].morph_
-    assert morph[1] == doc[1].morph_
-    assert morph[2] == doc[2].morph_
+    assert morph[0] == str(doc[0].morph)
+    assert morph[1] == str(doc[1].morph)
+    assert morph[2] == str(doc[2].morph)
 
     feats_array = doc.to_array((ORTH, MORPH))
     assert feats_array[0][1] == doc[0].morph.key
@@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
     words = ["I", "live", "in", "New", "York", "."]
     morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
     # fmt: on
-    doc = Doc(en_vocab, words=words)
-    for i, morph in enumerate(morphs):
-        doc[i].morph_ = morph
+    doc = Doc(en_vocab, words=words, morphs=morphs)
     attrs = [MORPH]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
     new_doc.from_array(attrs, arr)
-    assert [t.morph_ for t in new_doc] == morphs
-    assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
+    assert [str(t.morph) for t in new_doc] == morphs
+    assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
 
 
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
@@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
 
     doc[0].tag_ = "A"
     doc[0].pos_ = "X"
-    doc[0].morph_ = "Feat=Val"
+    doc[0].set_morph("Feat=Val")
     doc[0].lemma_ = "a"
     doc[0].dep_ = "dep"
     doc[0].head = doc[1]
@@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
 
     doc[1].tag_ = "A"
     doc[1].pos_ = "X"
-    doc[1].morph_ = ""
+    doc[1].set_morph("")
     doc[1].lemma_ = "a"
     doc[1].dep_ = "dep"
     doc.ents = [Span(doc, 0, 2, label="HELLO")]
@@ -533,5 +531,78 @@ def test_doc_ents_setter():
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
     vocab = Vocab()
     ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
     doc = Doc(vocab, words=words, ents=ents)
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+
+
+def test_doc_morph_setter(en_tokenizer, de_tokenizer):
+    doc1 = en_tokenizer("a b")
+    doc1b = en_tokenizer("c d")
+    doc2 = de_tokenizer("a b")
+
+    # unset values can be copied
+    doc1[0].morph = doc1[1].morph
+    assert doc1[0].morph.key == 0
+    assert doc1[1].morph.key == 0
+
+    # morph values from the same vocab can be copied
+    doc1[0].set_morph("Feat=Val")
+    doc1[1].morph = doc1[0].morph
+    assert doc1[0].morph == doc1[1].morph
+
+    # ... also across docs
+    doc1b[0].morph = doc1[0].morph
+    assert doc1[0].morph == doc1b[0].morph
+
+    doc2[0].set_morph("Feat2=Val2")
+
+    # the morph value must come from the same vocab
+    with pytest.raises(ValueError):
+        doc1[0].morph = doc2[0].morph
+
+
+def test_doc_init_iob():
+    """Test ents validation/normalization in Doc.__init__"""
+    words = ["a", "b", "c", "d", "e"]
+    ents = ["O"] * len(words)
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert doc.ents == ()
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 3
+
+    # None is missing
+    ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # empty tag is missing
+    ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # invalid IOB
+    ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # no dash
+    ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # no ent type
+    ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # not strings or None
+    ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
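For quick reference, a minimal sketch of the IOB-style ents argument these new tests exercise:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["Alice", "visited", "Paris"],
              ents=["B-PERSON", "O", "B-GPE"])
    assert [e.label_ for e in doc.ents] == ["PERSON", "GPE"]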
@@ -4,13 +4,13 @@ import pytest
 @pytest.fixture
 def i_has(en_tokenizer):
     doc = en_tokenizer("I has")
-    doc[0].morph_ = {"PronType": "prs"}
-    doc[1].morph_ = {
+    doc[0].set_morph({"PronType": "prs"})
+    doc[1].set_morph({
         "VerbForm": "fin",
         "Tense": "pres",
         "Number": "sing",
         "Person": "three",
-    }
+    })
 
     return doc
 
@@ -47,20 +47,20 @@ def test_morph_get(i_has):
 def test_morph_set(i_has):
     assert i_has[0].morph.get("PronType") == ["prs"]
     # set by string
-    i_has[0].morph_ = "PronType=unk"
+    i_has[0].set_morph("PronType=unk")
     assert i_has[0].morph.get("PronType") == ["unk"]
     # set by string, fields are alphabetized
-    i_has[0].morph_ = "PronType=123|NounType=unk"
-    assert i_has[0].morph_ == "NounType=unk|PronType=123"
+    i_has[0].set_morph("PronType=123|NounType=unk")
+    assert str(i_has[0].morph) == "NounType=unk|PronType=123"
     # set by dict
-    i_has[0].morph_ = {"AType": "123", "BType": "unk"}
-    assert i_has[0].morph_ == "AType=123|BType=unk"
+    i_has[0].set_morph({"AType": "123", "BType": "unk"})
+    assert str(i_has[0].morph) == "AType=123|BType=unk"
     # set by string with multiple values, fields and values are alphabetized
-    i_has[0].morph_ = "BType=c|AType=b,a"
-    assert i_has[0].morph_ == "AType=a,b|BType=c"
+    i_has[0].set_morph("BType=c|AType=b,a")
+    assert str(i_has[0].morph) == "AType=a,b|BType=c"
     # set by dict with multiple values, fields and values are alphabetized
-    i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
-    assert i_has[0].morph_ == "AType=a,b|BType=c"
+    i_has[0].set_morph({"AType": "b,a", "BType": "c"})
+    assert str(i_has[0].morph) == "AType=a,b|BType=c"
 
 
 def test_morph_str(i_has):
					@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
 | 
				
			||||||
    doc = tokenizer("a dog")
 | 
					    doc = tokenizer("a dog")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # set through token.morph_
 | 
					    # set through token.morph_
 | 
				
			||||||
    doc[0].morph_ = "PronType=prs"
 | 
					    doc[0].set_morph("PronType=prs")
 | 
				
			||||||
    assert doc[0].morph_ == "PronType=prs"
 | 
					    assert str(doc[0].morph) == "PronType=prs"
 | 
				
			||||||
    assert doc.to_array(["MORPH"])[0] != 0
 | 
					    assert doc.to_array(["MORPH"])[0] != 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # unset with token.morph
 | 
					    # unset with token.morph
 | 
				
			||||||
    doc[0].morph = 0
 | 
					    doc[0].set_morph(0)
 | 
				
			||||||
    assert doc.to_array(["MORPH"])[0] == 0
 | 
					    assert doc.to_array(["MORPH"])[0] == 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # empty morph is equivalent to "_"
 | 
					    # empty morph is equivalent to "_"
 | 
				
			||||||
    doc[0].morph_ = ""
 | 
					    doc[0].set_morph("")
 | 
				
			||||||
    assert doc[0].morph_ == ""
 | 
					    assert str(doc[0].morph) == ""
 | 
				
			||||||
    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
 | 
					    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # "_" morph is also equivalent to empty morph
 | 
					    # "_" morph is also equivalent to empty morph
 | 
				
			||||||
    doc[0].morph_ = "_"
 | 
					    doc[0].set_morph("_")
 | 
				
			||||||
    assert doc[0].morph_ == ""
 | 
					    assert str(doc[0].morph) == ""
 | 
				
			||||||
    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
 | 
					    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # set through existing hash with token.morph
 | 
					    # set through existing hash with token.morph
 | 
				
			||||||
    tokenizer.vocab.strings.add("Feat=Val")
 | 
					    tokenizer.vocab.strings.add("Feat=Val")
 | 
				
			||||||
    doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
 | 
					    doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
 | 
				
			||||||
    assert doc[0].morph_ == "Feat=Val"
 | 
					    assert str(doc[0].morph) == "Feat=Val"
 | 
				
			||||||
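These tests track the replacement of the writable `Token.morph_` attribute with an explicit `Token.set_morph` method; the string form is now read via `str(token.morph)`. A minimal migration sketch on a blank pipeline:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I have a dog")

doc[0].set_morph("PronType=prs")             # was: doc[0].morph_ = "PronType=prs"
assert str(doc[0].morph) == "PronType=prs"   # was: assert doc[0].morph_ == ...
doc[0].set_morph({"PronType": "prs"})        # dicts of feature/value pairs also work
doc[0].set_morph(0)                          # unsets the analysis
```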
@@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
-    assert doc[4].morph_ == "Number=Plur"
+    assert str(doc[4].morph) == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
-    assert doc[5].morph_ == "Number=Plur"
+    assert str(doc[5].morph) == "Number=Plur"


 def test_doc_retokenize_merge_children(en_tokenizer):

@@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
     ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
+    ents = ["O"] * len(heads)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[10] = "B-GPE"
+    ents[13] = "B-PERSON"
+    ents[14] = "I-PERSON"
     # fmt: on
     tokens = en_tokenizer(text)
     doc = Doc(

@@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # if there is a parse, span.root provides default values
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-fg"
+    ents[6] = "I-ent-fg"
     deps = ["dep"] * len(words)
     en_vocab.strings.add("ent-de")
     en_vocab.strings.add("ent-fg")

@@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # check that B is preserved if span[start] is B
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-de"
+    ents[6] = "I-ent-de"
     deps = ["dep"] * len(words)
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
     with doc.retokenize() as retokenizer:

@@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
     assert doc[0].text == "Los"
     assert doc[0].head.text == "Angeles"
     assert doc[0].idx == 0
-    assert doc[0].morph_ == "Number=Sing"
+    assert str(doc[0].morph) == "Number=Sing"
     assert doc[1].idx == 3
     assert doc[1].text == "Angeles"
     assert doc[1].head.text == "start"
-    assert doc[1].morph_ == "Number=Sing"
+    assert str(doc[1].morph) == "Number=Sing"
     assert doc[2].text == "start"
     assert doc[2].head.text == "."
     assert doc[3].text == "."

@@ -9,7 +9,7 @@ def doc(en_vocab):
     tags = ["VBP", "NN", "NN"]
     heads = [0, 0, 0]
     deps = ["ROOT", "dobj", "dobj"]
-    ents = [("ORG", 1, 2)]
+    ents = ["O", "B-ORG", "O"]
     return Doc(
         en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
     )
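As the updated fixtures show, `Doc` now takes `ents` as one IOB tag per token rather than `(label, start, end)` tuples. A minimal sketch of the new form (words and labels are illustrative):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
words = ["I", "like", "ACME", "a", "lot"]
ents = ["O"] * len(words)
ents[2] = "B-ORG"  # single-token entity
doc = Doc(vocab, words=words, ents=ents)
assert [(e.label_, e.start, e.end) for e in doc.ents] == [("ORG", 2, 3)]
```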
@@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
     matcher.add("M", [pattern])
     doc = Doc(en_vocab, words=["a", "b", "c"])
     assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val"
+    doc[0].set_morph("Feat=Val")
     assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    doc[0].set_morph("Feat=Val|Feat2=Val2")
     assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
     assert len(matcher(doc)) == 2
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
     assert len(matcher(doc)) == 2

     # IS_SUBSET acts like "IN" for attrs other than MORPH

@@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
     matcher.add("M", [pattern])
     doc = Doc(en_vocab, words=["a", "b", "c"])
     assert len(matcher(doc)) == 0
-    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    doc[0].set_morph("Feat=Val|Feat2=Val2")
     assert len(matcher(doc)) == 0
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
     assert len(matcher(doc)) == 1
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
     assert len(matcher(doc)) == 1

     # IS_SUPERSET with more than one value only matches for MORPH

@@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
     doc = Doc(en_vocab, words=["a", "b", "c"])
     assert len(matcher(doc)) == 0

-    doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+    doc[0].set_morph("Feat2=Val2|Feat1=Val1")
     assert len(matcher(doc)) == 2
-    doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+    doc[0].set_morph("Feat1=Val1|Feat2=Val2")
     assert len(matcher(doc)) == 2

     # multiple values are split

@@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
     doc = Doc(en_vocab, words=["a", "b", "c"])
     assert len(matcher(doc)) == 0

-    doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+    doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
     assert len(matcher(doc)) == 1
-    doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+    doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
     assert len(matcher(doc)) == 2

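For reference, these counts come from the set-valued `MORPH` operators in token patterns; a token with no morph features trivially satisfies `IS_SUBSET`. A hedged sketch of the pattern shape (the feature names are made up):

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
# a token matches if its morph features are a subset of the listed set
matcher.add("M", [[{"MORPH": {"IS_SUBSET": ["Feat1=Val1", "Feat2=Val2"]}}]])
doc = nlp("a b c")
doc[0].set_morph("Feat1=Val1")             # subset -> match
doc[1].set_morph("Feat1=Val1|Feat3=Val3")  # Feat3 is extra -> no match
assert len(matcher(doc)) == 2              # doc[0] and the unset doc[2]
```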
@@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
     doc2 = Doc(en_vocab, words=["Test"])
     doc2[0].tag_ = "TAG"
     doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
     doc2[0].lemma_ = "LEMMA"
     doc3 = Doc(en_vocab, words=["Test"])
     # DEP requires DEP

@@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
     doc2 = Doc(en_vocab, words=["Test"])
     doc2[0].tag_ = "TAG"
     doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
     doc3 = Doc(en_vocab, words=["Test"])
     matcher = PhraseMatcher(en_vocab, validate=True)
     with pytest.warns(UserWarning):

@@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
     doc2 = Doc(en_vocab, words=["Test"])
     doc2[0].tag_ = "TAG"
     doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
     doc2[0].lemma_ = "LEMMA"
     doc3 = Doc(en_vocab, words=["Test"])
     # DEP requires DEP

@@ -339,7 +339,6 @@ def test_ner_warns_no_lookups(caplog):
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
-    nlp.config["initialize"]["lookups"] = None
     with caplog.at_level(logging.DEBUG):
         nlp.initialize()
         assert "W033" in caplog.text
@@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
         a.add(**p)
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")

@@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")

@@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     )
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")

@@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
     nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"

     dev_examples = [
         Example.from_dict(

@@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
     for i in range(len(doc)):
         if i == 4:
             assert doc[i].pos_ == "PUNCT"
-            assert doc[i].morph_ == "PunctType=peri"
+            assert str(doc[i].morph) == "PunctType=peri"
         else:
             assert doc[i].pos_ == ""
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""


 def test_attributeruler_morph_rules(nlp, morph_rules):

@@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
     for i in range(len(doc)):
         if i != 2:
             assert doc[i].pos_ == ""
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
         else:
             assert doc[2].pos_ == "DET"
             assert doc[2].lemma_ == "a"
-            assert doc[2].morph_ == "Case=Nom"
+            assert str(doc[2].morph) == "Case=Nom"


 def test_attributeruler_indices(nlp):

@@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
     for i in range(len(doc)):
         if i == 1:
             assert doc[i].lemma_ == "was"
-            assert doc[i].morph_ == "Case=Nom|Number=Sing"
+            assert str(doc[i].morph) == "Case=Nom|Number=Sing"
         elif i == 2:
             assert doc[i].lemma_ == "the"
-            assert doc[i].morph_ == "Case=Nom|Number=Plur"
+            assert str(doc[i].morph) == "Case=Nom|Number=Plur"
         elif i == 3:
             assert doc[i].lemma_ == "cat"
         else:
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
     # raises an error when trying to modify a token outside of the match
     a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
     with pytest.raises(ValueError):
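The `index` argument addresses tokens within the matched span, and out-of-range indices raise the error asserted above. A hedged sketch of the in-range case (pattern and attrs mirror the test):

```python
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("attribute_ruler")
# index=1 targets the second token of each "a test" match
ruler.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=1)
doc = nlp("This is a test.")
assert doc[3].lemma_ == "cat"
```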
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     doc = nlp(test_text)
     gold_morphs = ["Feat=N", "Feat=V", "", ""]
     gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
-    assert [t.morph_ for t in doc] == gold_morphs
+    assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags

     # Also test the results are still the same after IO

@@ -99,5 +99,5 @@ def test_overfitting_IO():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [str(t.morph) for t in doc2] == gold_morphs
         assert [t.pos_ for t in doc2] == gold_pos_tags

@@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
     words = ["This", "is", "10", "%", "."]
     tags = ["DT", "VBZ", "CD", "NN", "."]
     pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [("PERCENT", 2, 4)]
+    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
     doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
     assert doc.has_annotation("TAG")
     expected = ("10", "NUM", "CD", "PERCENT")

@@ -76,7 +76,7 @@ def tagged_doc():
     for i in range(len(tags)):
         doc[i].tag_ = tags[i]
         doc[i].pos_ = pos[i]
-        doc[i].morph_ = morphs[i]
+        doc[i].set_morph(morphs[i])
         if i > 0:
             doc[i].is_sent_start = False
     return doc
@@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
         doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
+            ents=["B-CARDINAL", "O", "B-CARDINAL"],
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})

@@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
         doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
+            ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})

@@ -242,7 +242,7 @@ def test_tag_score(tagged_doc):
     gold = {
         "tags": [t.tag_ for t in tagged_doc],
         "pos": [t.pos_ for t in tagged_doc],
-        "morphs": [t.morph_ for t in tagged_doc],
+        "morphs": [str(t.morph) for t in tagged_doc],
         "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
     }
     example = Example.from_dict(tagged_doc, gold)

@@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
     tags[0] = "NN"
     pos = [t.pos_ for t in tagged_doc]
     pos[1] = "X"
-    morphs = [t.morph_ for t in tagged_doc]
+    morphs = [str(t.morph) for t in tagged_doc]
     morphs[1] = "Number=sing"
     morphs[2] = "Number=plur"
     gold = {
@@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots):
     predicted = Doc(vocab, words=annots["words"])
     example = Example.from_dict(predicted, annots)
     for i, token in enumerate(example.reference):
-        assert token.morph_ == annots["morphs"][i]
+        assert str(token.morph) == annots["morphs"][i]


 @pytest.mark.parametrize(

@@ -30,7 +30,12 @@ def doc(en_vocab):
     heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
     deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
     lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
+    ents = ["O"] * len(words)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[5] = "B-LOC"
+    ents[6] = "I-LOC"
+    ents[8] = "B-GPE"
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     # fmt: on
     doc = Doc(

@@ -455,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc):
     idx = [t.idx for t in doc]
     tags = [t.tag_ for t in doc]
     pos = [t.pos_ for t in doc]
-    morphs = [t.morph_ for t in doc]
+    morphs = [str(t.morph) for t in doc]
     lemmas = [t.lemma_ for t in doc]
     deps = [t.dep_ for t in doc]
     heads = [t.head.i for t in doc]

@@ -477,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc):
     assert idx == [t.idx for t in reloaded_example.reference]
     assert tags == [t.tag_ for t in reloaded_example.reference]
     assert pos == [t.pos_ for t in reloaded_example.reference]
-    assert morphs == [t.morph_ for t in reloaded_example.reference]
+    assert morphs == [str(t.morph) for t in reloaded_example.reference]
     assert lemmas == [t.lemma_ for t in reloaded_example.reference]
     assert deps == [t.dep_ for t in reloaded_example.reference]
     assert heads == [t.head.i for t in reloaded_example.reference]
@@ -101,7 +101,7 @@ class DocBin:
             self.strings.add(token.text)
             self.strings.add(token.tag_)
             self.strings.add(token.lemma_)
-            self.strings.add(token.morph_)
+            self.strings.add(str(token.morph))
             self.strings.add(token.dep_)
             self.strings.add(token.ent_type_)
             self.strings.add(token.ent_kb_id_)
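`MORPH` is among the attributes `DocBin` serializes, so interning `str(token.morph)` keeps round-trips lossless. A small sketch of the round-trip (attrs restricted to `MORPH` for brevity):

```python
from spacy.tokens import Doc, DocBin
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["Hello", "world"])
doc[0].set_morph("Number=Sing")
doc_bin = DocBin(attrs=["MORPH"])
doc_bin.add(doc)  # str(token.morph) is added to the string store here
restored = list(DocBin().from_bytes(doc_bin.to_bytes()).get_docs(vocab))
assert str(restored[0][0].morph) == "Number=Sing"
```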
@@ -213,8 +213,9 @@ cdef class Doc:
         sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
             the same length as words, to assign as token.is_sent_start. Will be
             overridden by heads if heads is provided. Defaults to None.
-        ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
-            (label, start, end) tuples to assign as doc.ents. Defaults to None.
+        ents (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, as IOB tags to assign as token.ent_iob and
+            token.ent_type. Defaults to None.

         DOCS: https://nightly.spacy.io/api/doc#init
         """

@@ -275,16 +276,55 @@ cdef class Doc:
                     sent_starts[i] = -1
                 elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
                     sent_starts[i] = 0
+        ent_iobs = None
+        ent_types = None
+        if ents is not None:
+            iob_strings = Token.iob_strings()
+            # make valid IOB2 out of IOB1 or IOB2
+            for i, ent in enumerate(ents):
+                if ent is "":
+                    ents[i] = None
+                elif ent is not None and not isinstance(ent, str):
+                    raise ValueError(Errors.E177.format(tag=ent))
+                if i < len(ents) - 1:
+                    # OI -> OB
+                    if (ent is None or ent.startswith("O")) and \
+                            (ents[i+1] is not None and ents[i+1].startswith("I")):
+                        ents[i+1] = "B" + ents[i+1][1:]
+                    # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
+                    if ent is not None and ents[i+1] is not None and \
+                            (ent.startswith("B") or ent.startswith("I")) and \
+                            ents[i+1].startswith("I") and \
+                            ent[1:] != ents[i+1][1:]:
+                        ents[i+1] = "B" + ents[i+1][1:]
+            ent_iobs = []
+            ent_types = []
+            for ent in ents:
+                if ent is None:
+                    ent_iobs.append(iob_strings.index(""))
+                    ent_types.append("")
+                elif ent == "O":
+                    ent_iobs.append(iob_strings.index(ent))
+                    ent_types.append("")
+                else:
+                    if len(ent) < 3 or ent[1] != "-":
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob, ent_type = ent.split("-", 1)
+                    if ent_iob not in iob_strings:
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob = iob_strings.index(ent_iob)
+                    ent_iobs.append(ent_iob)
+                    ent_types.append(ent_type)
         headings = []
         values = []
-        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
-        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
         for a, annot in enumerate(annotations):
             if annot is not None:
                 if len(annot) != len(words):
                     raise ValueError(Errors.E189)
                 headings.append(possible_headings[a])
-                if annot is not heads and annot is not sent_starts:
+                if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                     values.extend(annot)
         for value in values:
             self.vocab.strings.add(value)
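The effect of this normalization, sketched on a toy input: IOB1 allows an entity to open with `I-` right after `O`, and the constructor rewrites it to valid IOB2 before storing the per-token annotations:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

# "I-GPE" after "O" is repaired to "B-GPE" by the OI -> OB rule above
doc = Doc(Vocab(), words=["in", "New", "York"], ents=["O", "I-GPE", "I-GPE"])
assert [t.ent_iob_ for t in doc] == ["O", "B", "I"]
assert [(e.label_, e.start, e.end) for e in doc.ents] == [("GPE", 1, 3)]
```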
@@ -296,7 +336,7 @@ cdef class Doc:
             j = 0
             for annot in annotations:
                 if annot:
-                    if annot is heads or annot is sent_starts:
+                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]

@@ -317,8 +357,6 @@ cdef class Doc:
                                 attrs[i, j] = self.vocab.strings[annot[i]]
                     j += 1
             self.from_array(headings, attrs)
-        if ents is not None:
-            self.ents = ents

     @property
     def _(self):

@@ -1210,7 +1248,7 @@ cdef class Doc:
         for token in self:
             strings.add(token.tag_)
             strings.add(token.lemma_)
-            strings.add(token.morph_)
+            strings.add(str(token.morph))
             strings.add(token.dep_)
             strings.add(token.ent_type_)
             strings.add(token.ent_kb_id_)
@@ -215,20 +215,20 @@ cdef class Token:
         def __get__(self):
             return MorphAnalysis.from_id(self.vocab, self.c.morph)

-        def __set__(self, attr_t morph):
-            if morph == 0:
-                self.c.morph = morph
-            elif morph in self.vocab.strings:
-                self.morph_ = self.vocab.strings[morph]
-            else:
-                raise ValueError(Errors.E1009.format(val=morph))
-
-    property morph_:
-        def __get__(self):
-            return str(MorphAnalysis.from_id(self.vocab, self.c.morph))
-
-        def __set__(self, features):
-            cdef hash_t key = self.vocab.morphology.add(features)
-            self.c.morph = key
+        def __set__(self, MorphAnalysis morph):
+            # Check that the morph has the same vocab
+            if self.vocab != morph.vocab:
+                raise ValueError(Errors.E1013)
+            self.c.morph = morph.c.key
+
+    def set_morph(self, features):
+        cdef hash_t key
+        if features is 0:
+            self.c.morph = 0
+        else:
+            if isinstance(features, int):
+                features = self.vocab.strings[features]
+            key = self.vocab.morphology.add(features)
+            self.c.morph = key

     @property
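With this change the `Token.morph` property only accepts a `MorphAnalysis` from the same vocab, while `set_morph` covers the convenience inputs the old setters handled. A minimal sketch of the accepted inputs:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("they walk")
doc[0].set_morph("Case=Nom|Number=Plur")  # FEATS string
doc[0].set_morph({"Case": "Nom"})         # dict of feature/value pairs
doc[1].morph = doc[0].morph               # MorphAnalysis from the same vocab
doc[0].set_morph(0)                       # 0 unsets the analysis
```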
@@ -207,6 +207,7 @@ def conllu_sentence_to_doc(
         pos=poses,
         deps=deps,
         lemmas=lemmas,
+        morphs=morphs,
         heads=heads,
     )
     for i in range(len(doc)):

@@ -46,7 +46,7 @@ def create_jsonl_reader(


 @util.registry.readers("spacy.read_labels.v1")
-def read_labels(path: Path, *, require: bool=False):
+def read_labels(path: Path, *, require: bool = False):
     # I decided not to give this a generic name, because I don't want people to
     # use it for arbitrary stuff, as I want this require arg with default False.
     if not require and not path.exists():

@@ -1,4 +1,4 @@
-from collections import Iterable as IterableInstance
+from collections.abc import Iterable as IterableInstance
 import warnings
 import numpy
 from murmurhash.mrmr cimport hash64
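The ABCs moved to `collections.abc` in Python 3.3 and the old aliases in `collections` were removed in Python 3.10, so only the updated import keeps working:

```python
# works on all modern Pythons; `from collections import Iterable`
# raises ImportError on Python 3.10+
from collections.abc import Iterable as IterableInstance

assert isinstance([1, 2], IterableInstance)
assert not isinstance(1, IterableInstance)
```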
@@ -226,7 +226,7 @@ cdef class Example:
                 "TAG": [t.tag_ for t in self.reference],
                 "LEMMA": [t.lemma_ for t in self.reference],
                 "POS": [t.pos_ for t in self.reference],
-                "MORPH": [t.morph_ for t in self.reference],
+                "MORPH": [str(t.morph) for t in self.reference],
                 "HEAD": [t.head.i for t in self.reference],
                 "DEP": [t.dep_ for t in self.reference],
                 "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]

@@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
                 if include_annotation["POS"]:
                     json_token["pos"] = token.pos_
                 if include_annotation["MORPH"]:
-                    json_token["morph"] = token.morph_
+                    json_token["morph"] = str(token.morph)
                 if include_annotation["LEMMA"]:
                     json_token["lemma"] = token.lemma_
                 if include_annotation["DEP"]:

@@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
 a feed-forward subnetwork to build mixed representations. The features used are
-the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
-depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
-static vectors can also be incorporated into the concatenated representation.
+the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
+[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from
+pretrained static vectors can also be incorporated into the concatenated representation.

 | Name                      | Description |
 | ------------------------- | ----------- |
@@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
 | `key_attr`  | Defaults to `"ORTH"`. ~~str~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |

+### spacy.FeatureExtractor.v1 {#FeatureExtractor}
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.FeatureExtractor.v1"
+> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+> ```
+
+Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
+of feature names to extract, which should refer to token attributes.
+
+| Name        | Description                                                              |
+| ----------- | ------------------------------------------------------------------------ |
+| `columns`   | The token attributes to extract. ~~List[Union[int, str]]~~               |
+| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
+
 ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}

 The following architectures are provided by the package
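A hedged sketch of building the registered layer directly; the registry lookup and `predict` call are standard spaCy/Thinc APIs, and the column choice is illustrative:

```python
import spacy

nlp = spacy.blank("en")
make_extractor = spacy.registry.architectures.get("spacy.FeatureExtractor.v1")
model = make_extractor(columns=["NORM", "PREFIX", "SUFFIX", "SHAPE"])
docs = [nlp.make_doc("A sample sentence")]
arrays = model.predict(docs)  # one Ints2d per doc, shape (n_tokens, n_columns)
```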
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -186,15 +186,14 @@ This functionality was previously available as part of the command `init-model`.
 | 
				
			||||||

</Infobox>

```cli
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
```

| Name               | Description                                                                                                                                                                                                                                                          |
| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang`             | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~                                                                                                                                                  |
| `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~   |
| `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                                 |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                    |
| `--prune`, `-p`    | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                       |
| `--name`, `-n`     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                     |
@@ -202,6 +201,39 @@
| `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                            |
| **CREATES**        | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                          |
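
For example, a typical invocation might look like this (hypothetical paths and
sizes):

```cli
$ python -m spacy init vectors en /path/to/vectors.txt ./output_dir --prune 20000 --name en_example.vectors
```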

### init labels {#init-labels new="3" tag="command"}

Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to extract the
labels. After generating the labels, you can provide them to components that
accept a `labels` argument on initialization via the
[`[initialize]`](/api/data-formats#config-initialize) block of your config.

> #### Example config
>
> ```ini
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json"
> ```

```cli
$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
```

| Name              | Description                                                                                                                                                                                 |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                                 |
| `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~                                                                                        |
| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~        |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                |
| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                  |
| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                  |
| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~  |
| **CREATES**       | The label files.                                                                                                                                                                            |
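
The generated files are plain JSON, so they can be inspected before being wired
into the config. A minimal sketch, using the path from the example config
above:

```python
# Minimal sketch: inspect the labels generated by `init labels`.
import srsly

labels = srsly.read_json("corpus/labels/ner.json")
print(labels)  # the label set extracted from your training data
```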

## convert {#convert tag="command"}

Convert files into spaCy's

@@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.

> data_path = "/path/to/component_data"
> ```

| Name         | Description                                                                                                                                                                                                                                                                                                                                 |
| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~  |

@@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)

## Lexical data for vocabulary {#vocab-jsonl new="2"}

This data file can be provided via the `vocab_data` setting in the
`[initialize]` block of the training config to pre-define the lexical data to
initialize the `nlp` object's vocabulary with. The file should contain one
lexical entry per line. The first line defines the language and vocabulary
settings. All other lines are expected to be JSON objects describing an
individual lexeme. The lexical attributes will then be set as attributes on
spaCy's [`Lexeme`](/api/lexeme#attributes) object.

> #### Example config
>
> ```ini
> [initialize]
> vocab_data = "/path/to/vocab-data.jsonl"
> ```

```python
### First line
```
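
For illustration, a file in this format could be created programmatically. A
minimal sketch with invented attribute values; `oov_prob` is shown only as an
assumed settings key:

```python
# Minimal sketch: write a vocab-data.jsonl file for the [initialize] block.
import srsly

lines = [
    # First line: language plus vocabulary settings (values invented).
    {"lang": "en", "settings": {"oov_prob": -20.0}},
    # Every following line: one JSON object per lexeme (attributes invented).
    {"orth": "hello", "norm": "hello", "is_alpha": True},
]
srsly.write_jsonl("/path/to/vocab-data.jsonl", lines)
```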

@@ -21,8 +21,9 @@ non-projective parses.

The parser is trained using an **imitation learning objective**. It follows the
actions predicted by the current weights, and at each state, determines which
actions are compatible with the optimal parse that could be reached from the
current state. The weights are updated such that the scores assigned to the set
of optimal actions are increased, while scores assigned to other actions are
decreased. Note that more than one action may be optimal for a given state.

## Config and implementation {#config}

@@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and

| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |

## DependencyParser.initialize {#initialize tag="method" new="3"}

Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@@ -162,12 +166,22 @@ This method was previously called `begin_training`.

> parser = nlp.add_pipe("parser")
> parser.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.parser]
>
> [initialize.components.parser.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/parser.json"
> ```

| Name           | Description                                                                                                                                                                                                                                                                                                         |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
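
At runtime, the same pre-generated label data can be passed straight to the
method. A minimal sketch; the path is the hypothetical output of
[`init labels`](/api/cli#init-labels):

```python
# Minimal sketch: initialize the parser with pre-generated label data.
import spacy
import srsly

nlp = spacy.blank("en")
parser = nlp.add_pipe("parser")
labels = srsly.read_json("corpus/labels/parser.json")  # from `init labels`
parser.initialize(lambda: [], nlp=nlp, labels=labels)
```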

## DependencyParser.predict {#predict tag="method"}

@@ -32,7 +32,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the

> ```

| Name                                     | Description                                                                                                                                                                                         |
| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                    |
| `words`                                  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                                  |
| `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~        |
@@ -45,7 +45,7 @@
| `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~  |
| `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                               |
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~  |
| `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as the token-based IOB tag for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                   |
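
For illustration, constructing a `Doc` with token-based IOB entity tags might
look like this (a minimal sketch):

```python
# Minimal sketch: build a Doc directly and set entities via IOB tags.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["Apple", "is", "great"], ents=["B-ORG", "O", "O"])
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Apple', 'ORG')]
```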

## Doc.\_\_getitem\_\_ {#getitem tag="method"}

@@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.

Mark a span for merging. The `attrs` will be applied to the resulting token (if
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
underlying lexeme (if they're context-independent lexical attributes like
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
using the `"_"` key and specifying a dictionary that maps attribute names to
values.

> #### Example
>
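
A minimal sketch of the custom-attribute case (`is_city` is a hypothetical
extension registered just for this example, not a built-in):

```python
# Minimal sketch: merge a span and set a custom extension via the "_" key.
import spacy
from spacy.tokens import Token

Token.set_extension("is_city", default=False)  # hypothetical extension

nlp = spacy.blank("en")
doc = nlp("I live in New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "new york", "_": {"is_city": True}})
print(doc[3].text, doc[3]._.is_city)  # New York True
```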

@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and

| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |

## EntityLinker.initialize {#initialize tag="method" new="3"}

Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are

@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and

| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |

## EntityRecognizer.initialize {#initialize tag="method" new="3"}

Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@@ -152,12 +155,22 @@ This method was previously called `begin_training`.

> ner = nlp.add_pipe("ner")
> ner.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json"
> ```

| Name           | Description                                                                                                                                                                                                                                                                                                         |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
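
Equivalently at runtime, the component can extract its labels from the examples
themselves. A minimal sketch, with a single in-memory example standing in for
real training data:

```python
# Minimal sketch: let the NER component extract its labels from examples.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
doc = nlp.make_doc("I like London")
example = Example.from_dict(doc, {"entities": ["O", "O", "U-GPE"]})
ner.initialize(lambda: [example], nlp=nlp)
print(ner.labels)  # the labels extracted from the example data
```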

## EntityRecognizer.predict {#predict tag="method"}

@@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.

| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~  |

## Language.initialize {#initialize tag="method" new="3"}

Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
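
In its simplest form, initializing a fresh pipeline might look like this (a
minimal sketch; a single in-memory example supplies the tagger's labels):

```python
# Minimal sketch: initialize a blank pipeline before training.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
doc = nlp.make_doc("I like pizza")
example = Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})
optimizer = nlp.initialize(get_examples=lambda: [example])
```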

@@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.

> #### Example
>

@@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).

> morphologizer = nlp.add_pipe("morphologizer")
> morphologizer.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.morphologizer]
>
> [initialize.components.morphologizer.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/morphologizer.json"
> ```

| Name           | Description                                                                                                                                                                                                                                                                                                         |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

## Morphologizer.predict {#predict tag="method"}

@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and

| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |

## Pipe.initialize {#initialize tag="method" new="3"}

Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are

@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and

| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |

## Tagger.initialize {#initialize tag="method" new="3"}

Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@@ -135,12 +138,22 @@ This method was previously called `begin_training`.

> tagger = nlp.add_pipe("tagger")
> tagger.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.tagger]
>
> [initialize.components.tagger.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/tagger.json"
> ```

| Name           | Description                                                                                                                                                                                                                                                                                                         |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |

## Tagger.predict {#predict tag="method"}

@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and

| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |

## TextCategorizer.initialize {#initialize tag="method" new="3"}

Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@@ -148,12 +151,22 @@ This method was previously called `begin_training`.

> textcat = nlp.add_pipe("textcat")
> textcat.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.textcat]
>
> [initialize.components.textcat.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/textcat.json"
> ```

| Name           | Description                                                                                                                                                                                                                                                                                                         |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
## TextCategorizer.predict {#predict tag="method"}
 | 
					## TextCategorizer.predict {#predict tag="method"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
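Editor's aside: a minimal runnable sketch of the `labels` argument documented in the hunk above. This is not part of the diff, and the label names are made up; with `labels` supplied up front, `initialize` doesn't need to extract the label set from the training examples.

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
# Hypothetical labels; normally these would come from `init labels` via
# the [initialize.components.textcat.labels] block shown above.
textcat.initialize(lambda: [], nlp=nlp, labels=["POSITIVE", "NEGATIVE"])
print(textcat.labels)  # ('POSITIVE', 'NEGATIVE')
```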
@@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
 | **CREATES**  | The corpus reader. ~~JsonlTexts~~                                                                                                |
+
+### spacy.read_labels.v1 {#read_labels tag="registered function"}
+
+Read a JSON-formatted labels file generated with
+[`init labels`](/api/cli#init-labels). Typically used in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the training
+config to speed up the model initialization process and provide pre-generated
+label sets.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components]
+>
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+| Name        | Description                                                                                                                                                                                                               |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
+| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
+| **CREATES** | The labels read from the file.                                                                                                                                                                                            |
+
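Editor's aside, not from the diff: the labels file is plain JSON, so you can inspect what this reader will hand to `initialize` using [`srsly`](https://github.com/explosion/srsly), spaCy's serialization library. The path below is hypothetical.

```python
import srsly

# Peek at a labels file produced by `init labels`; this is the content
# that "spacy.read_labels.v1" loads for the component.
labels = srsly.read_json("corpus/labels/ner.json")  # hypothetical path
print(labels)
```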
 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
 
 A data batcher implements a batching strategy that essentially turns a stream of
@@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
 embeddings.
 
 ```python
-from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor
+from thinc.api import add, chain, remap_ids, Embed
 from spacy.ml.staticvectors import StaticVectors
+from spacy.ml.featureextractor import FeatureExtractor
 from spacy.util import registry
 
 @registry.architectures("my_example.MyEmbedding.v1")
@@ -204,7 +204,19 @@ initialize it.
 
-<!-- TODO: explain lifecycle and initialization -->
+At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
+config and load all data, including tokenization rules, model weights and other
+resources, from the pipeline directory. The `[training]` block contains the
+settings for training the model and is only used during training. Similarly,
+the `[initialize]` block defines how the initial `nlp` object should be set up
+before training and whether it should be initialized with vectors or pretrained
+tok2vec weights, or any other data needed by the components.
+
+The initialization settings are only loaded and used when
+[`nlp.initialize`](/api/language#initialize) is called (typically right before
+training). This allows you to set up your pipeline using local data resources
+and custom functions, and preserve the information in your config – but without
+requiring it to be available at runtime.
 
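Editor's aside: a small sketch of the lifecycle the new paragraphs describe; this is not part of the diff, and the pipeline name is a stand-in.

```python
import spacy

# Runtime: only [nlp] and [components] are used, and all data is loaded
# from the pipeline directory -- nothing from [training] or [initialize].
nlp = spacy.load("en_core_web_sm")

# Training-time: a pipeline built from the config is initialized once,
# which is when the [initialize] block (vectors, labels, ...) is applied.
# `spacy train` calls this for you.
fresh = spacy.blank("en")
fresh.initialize()
```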
 ### Overwriting config settings on the command line {#config-overrides}
 
@@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
     return create_model(output_width)
 ```
+
+<!-- TODO:
+### Customizing the initialization {#initialization}
+-->
 
 ## Data utilities {#data}
 
 spaCy includes various features and utilities to make it easy to train models
@@ -853,7 +869,7 @@ nlp = spacy.blank("en")
 docbin = DocBin(nlp.vocab)
 words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
 spaces = [True, True, True, True, True, True, True, False]
-ents = [("ORG", 0, 1), ("GPE", 5, 6)]
+ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
 doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
 docbin.add(doc)
 docbin.to_disk("./train.spacy")
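Editor's note on the hunk above: in v3, `Doc(..., ents=...)` accepts one IOB-style tag per token (`"B-ORG"`, `"I-ORG"`, `"O"`, ...), so the updated `ents` list has exactly one entry per item in `words`.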
@@ -104,7 +104,6 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 >
 > ```ini
 > [training]
-> vectors = null
 > accumulate_gradient = 3
 >
 > [training.optimizer]
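Editor's note (an inference, not stated in the diff): dropping `vectors = null` from this `[training]` example is consistent with vector setup now being handled by the `[initialize]` block and the new `init vectors` command documented elsewhere in this merge.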
@@ -430,6 +429,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config)                        | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config).                                                                     |
 | [`util.get_installed_models`](/api/top-level#util.get_installed_models)                                                         | Names of all pipeline packages installed in the environment.                                                                                                                                     |
 | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training).                                                                                                   |
+| [`init vectors`](/api/cli#init-vectors)                                                                                         | Convert word vectors for use with spaCy.                                                                                                                                                         |
+| [`init labels`](/api/cli#init-labels)                                                                                           | Generate JSON files for the labels in the data to speed up training.                                                                                                                             |
 | [`project`](/api/cli#project)                                                                                                   | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects).                                                                                                       |
 | [`ray`](/api/cli#ray)                                                                                                           | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package.                                |
@@ -1,6 +1,11 @@
 const autoprefixer = require('autoprefixer')
 const path = require('path')
+
+// https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/
+const sharp = require('sharp')
+sharp.cache(false)
+sharp.simd(false)
 
 // Markdown plugins
 const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js')
 const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js')
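Editor's note on the gatsby-config.js hunk: `sharp.cache(false)` disables sharp's on-disk cache and `sharp.simd(false)` disables its SIMD fast paths; per the linked post, this works around a segmentation fault some Gatsby builds hit on Netlify, trading a bit of image-processing speed for stability.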