Matthew Honnibal 2020-10-01 23:07:53 +02:00
commit 75a1569908
59 changed files with 576 additions and 342 deletions

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "3.0.0a26" __version__ = "3.0.0a28"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -7,6 +7,7 @@ import srsly
from .. import util from .. import util
from ..training.initialize import init_nlp, convert_vectors from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu from ._util import import_code, setup_gpu
@ -19,9 +20,9 @@ def init_vectors_cli(
output_dir: Path = Arg(..., help="Pipeline output directory"), output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
# fmt: on # fmt: on
): ):
"""Convert word vectors for use with spaCy. Will export an nlp object that """Convert word vectors for use with spaCy. Will export an nlp object that
@ -32,12 +33,7 @@ def init_vectors_cli(
msg.info(f"Creating blank nlp object for language '{lang}'") msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)() nlp = util.get_lang_class(lang)()
if jsonl_loc is not None: if jsonl_loc is not None:
lex_attrs = srsly.read_jsonl(jsonl_loc) update_lexemes(nlp, jsonl_loc)
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name) convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir) nlp.to_disk(output_dir)
@ -48,6 +44,16 @@ def init_vectors_cli(
) )
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
# Mostly used for backwards-compatibility and may be removed in the future
lex_attrs = srsly.read_jsonl(jsonl_loc)
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
@init_cli.command( @init_cli.command(
"nlp", "nlp",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
@ -89,7 +95,7 @@ def init_labels_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on # fmt: on
): ):
"""Generate a JSON file for labels in the data. This helps speed up the """Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to training process, since spaCy won't have to preprocess the data to
extract the labels.""" extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
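The `--lexemes-jsonl` option is now hidden and kept mostly for backwards compatibility (see the new `update_lexemes` helper above). A rough sketch of the JSONL format it consumes, using a hypothetical lexemes.jsonl and mirroring the helper's logic:

import srsly
from spacy.lang.en import English

# hypothetical attributes file: one JSON dict per line, keyed by "orth";
# a {"settings": ...} line is skipped, as in update_lexemes()
srsly.write_jsonl("lexemes.jsonl", [{"orth": "spaCy", "norm": "spacy"}])

nlp = English()
for attrs in srsly.read_jsonl("lexemes.jsonl"):
    if "settings" in attrs:
        continue
    lexeme = nlp.vocab[attrs["orth"]]
    lexeme.set_attrs(**attrs)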

View File

@ -2,7 +2,6 @@
train = null train = null
dev = null dev = null
vectors = null vectors = null
vocab_data = null
init_tok2vec = null init_tok2vec = null
[system] [system]
@ -11,8 +10,13 @@ gpu_allocator = null
[nlp] [nlp]
lang = null lang = null
# List of pipeline component names, in order. The names should correspond to
# components defined in the [components block]
pipeline = [] pipeline = []
# Components that are loaded but disabled by default
disabled = [] disabled = []
# Optional callbacks to modify the nlp object before it's initialized, after
# it's created and after the pipeline has been set up
before_creation = null before_creation = null
after_creation = null after_creation = null
after_pipeline_creation = null after_pipeline_creation = null
@ -20,6 +24,7 @@ after_pipeline_creation = null
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1" @tokenizers = "spacy.Tokenizer.v1"
# The pipeline components and their models
[components] [components]
# Readers for corpora like dev and train. # Readers for corpora like dev and train.
@ -38,8 +43,7 @@ max_length = 0
limit = 0 limit = 0
# Apply some simply data augmentation, where we replace tokens with variations. # Apply some simply data augmentation, where we replace tokens with variations.
# This is especially useful for punctuation and case replacement, to help # This is especially useful for punctuation and case replacement, to help
# generalize beyond corpora that don't have smart-quotes, or only have smart # generalize beyond corpora that don't/only have smart quotes etc.
# quotes, etc.
augmenter = null augmenter = null
[corpora.dev] [corpora.dev]
@ -53,6 +57,7 @@ gold_preproc = false
max_length = 0 max_length = 0
# Limitation on number of training examples # Limitation on number of training examples
limit = 0 limit = 0
# Optional callback for data augmentation
augmenter = null augmenter = null
# Training hyper-parameters and additional features. # Training hyper-parameters and additional features.
@ -102,17 +107,18 @@ use_averages = false
eps = 1e-8 eps = 1e-8
learn_rate = 0.001 learn_rate = 0.001
# The 'initialize' step is run before training or pretraining. Components and # These settings are used when nlp.initialize() is called (typically before
# the tokenizer can each define their own arguments via their .initialize # training or pretraining). Components and the tokenizer can each define their
# methods that are populated by the config. This lets them gather resources like # own arguments via their initialize methods that are populated by the config.
# lookup tables and build label sets, construct vocabularies, etc. # This lets them gather data resources, build label sets etc.
[initialize] [initialize]
vocab_data = ${paths.vocab_data}
lookups = null
vectors = ${paths.vectors} vectors = ${paths.vectors}
# Extra resources for transfer-learning or pseudo-rehearsal # Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec} init_tok2vec = ${paths.init_tok2vec}
# Data and lookups for vocabulary
vocab_data = null
lookups = null
# Arguments passed to the tokenizer's initialize method # Arguments passed to the tokenizer's initialize method
tokenizer = {} tokenizer = {}
# Arguments passed to the initialize methods of the components (keyed by component name) # Arguments for initialize methods of the components (keyed by component)
components = {} components = {}
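The lookups and vocab data now default to null here rather than being wired up in the per-language defaults (see the language files below, where the [initialize.lookups] blocks are removed). A rough sketch of adding the lexeme_norm tables back at initialization time, assuming spacy-lookups-data is installed and that the [initialize] block is resolved from nlp.config when nlp.initialize() runs:

import spacy

nlp = spacy.blank("da")
# roughly equivalent to the [initialize.lookups] block that used to ship with
# the language defaults; resolved via the registered
# "spacy.LookupsDataLoader.v1" function when nlp.initialize() is called
nlp.config["initialize"]["lookups"] = {
    "@misc": "spacy.LookupsDataLoader.v1",
    "lang": nlp.lang,
    "tables": ["lexeme_norm"],
}
nlp.initialize()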

View File

@ -710,6 +710,9 @@ class Errors:
"options: {modes}") "options: {modes}")
E1012 = ("Entity spans and blocked/missing/outside spans should be " E1012 = ("Entity spans and blocked/missing/outside spans should be "
"provided to doc.set_ents as lists of `Span` objects.") "provided to doc.set_ents as lists of `Span` objects.")
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
"token itself. To set the morph from this MorphAnalysis, set from "
"the string value with: `token.set_morph(str(other_morph))`.")
@add_codes @add_codes
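The new E1013 is raised by the Token.morph setter (see the token.pyx change below) when a MorphAnalysis from a different vocab is assigned; copying across vocabs goes through the string value instead. A small sketch:

import spacy

nlp_en = spacy.blank("en")
nlp_de = spacy.blank("de")
doc_en = nlp_en("a b")
doc_de = nlp_de("a b")
doc_en[0].set_morph("Case=Nom")
# doc_de[0].morph = doc_en[0].morph  would raise E1013 (different vocabs)
doc_de[0].set_morph(str(doc_en[0].morph))  # round-trip through the string value
assert str(doc_de[0].morph) == "Case=Nom"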

View File

@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class DanishDefaults(Language.Defaults): class DanishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class GermanDefaults(Language.Defaults): class GermanDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .lemmatizer import GreekLemmatizer from .lemmatizer import GreekLemmatizer
from ...lookups import Lookups from ...lookups import Lookups
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class GreekDefaults(Language.Defaults): class GreekDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@ -4,21 +4,9 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class IndonesianDefaults(Language.Defaults): class IndonesianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class LuxembourgishDefaults(Language.Defaults): class LuxembourgishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS

View File

@ -3,21 +3,9 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class PortugueseDefaults(Language.Defaults): class PortugueseDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES

View File

@ -7,21 +7,9 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer from .lemmatizer import RussianLemmatizer
from ...language import Language from ...language import Language
from ...lookups import Lookups from ...lookups import Lookups
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -2,21 +2,9 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class SerbianDefaults(Language.Defaults): class SerbianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -1,21 +1,9 @@
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class TamilDefaults(Language.Defaults): class TamilDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -10,13 +10,6 @@ DEFAULT_CONFIG = """
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.th.ThaiTokenizer" @tokenizers = "spacy.th.ThaiTokenizer"
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
""" """

View File

@ -0,0 +1,25 @@
from typing import List, Union, Callable, Tuple
from thinc.types import Ints2d
from ..tokens import Doc
from thinc.api import Model, registry
@registry.layers("spacy.FeatureExtractor.v1")
def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
return Model("extract_features", forward, attrs={"columns": columns})
def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
columns = model.attrs["columns"]
features: List[Ints2d] = []
for doc in docs:
if hasattr(doc, "to_array"):
attrs = doc.to_array(columns)
else:
attrs = doc.doc.to_array(columns)[doc.start : doc.end]
if attrs.ndim == 1:
attrs = attrs.reshape((attrs.shape[0], 1))
features.append(model.ops.asarray2i(attrs, dtype="uint64"))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop
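A minimal usage sketch for the relocated layer (it was previously imported from thinc.api, see the model files below), assuming it lands at spacy.ml.featureextractor as the relative imports suggest:

import spacy
from spacy.attrs import ORTH, SHAPE
from spacy.ml.featureextractor import FeatureExtractor

nlp = spacy.blank("en")
docs = [nlp("give it back"), nlp("he pleaded")]
extractor = FeatureExtractor([ORTH, SHAPE])
# one uint64 array per doc, shape (n_tokens, n_columns)
features = extractor.predict(docs)
assert features[0].shape == (3, 2)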

View File

@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_array, with_cpu, uniqued from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor from thinc.api import Relu, residual, expand_window
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry from ...util import registry
from ..extract_ngrams import extract_ngrams from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
@registry.architectures.register("spacy.TextCatCNN.v1") @registry.architectures.register("spacy.TextCatCNN.v1")

View File

@ -1,16 +1,16 @@
from typing import Optional, List, Union from typing import Optional, List, Union
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list
from thinc.api import FeatureExtractor, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from thinc.types import Floats2d from thinc.types import Floats2d
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc from ...tokens import Doc
from ...util import registry from ...util import registry
from ...ml import _character_embed from ...ml import _character_embed
from ..staticvectors import StaticVectors from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
@registry.architectures.register("spacy.Tok2VecListener.v1") @registry.architectures.register("spacy.Tok2VecListener.v1")
@ -98,7 +98,7 @@ def MultiHashEmbed(
attributes using hash embedding, concatenates the results, and passes it attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representations. through a feed-forward subnetwork to build a mixed representations.
The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
varying definitions depending on the Vocab of the Doc object passed in. varying definitions depending on the Vocab of the Doc object passed in.
Vectors from pretrained static vectors can also be incorporated into the Vectors from pretrained static vectors can also be incorporated into the
concatenated representation. concatenated representation.
@ -115,7 +115,7 @@ def MultiHashEmbed(
also_use_static_vectors (bool): Whether to also use static word vectors. also_use_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab. Requires a vectors table to be loaded in the Doc objects' vocab.
""" """
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
seed = 7 seed = 7
def make_hash_embed(feature): def make_hash_embed(feature):
@ -123,7 +123,7 @@ def MultiHashEmbed(
seed += 1 seed += 1
return HashEmbed( return HashEmbed(
width, width,
rows if feature == NORM else rows // 2, rows if feature == LOWER else rows // 2,
column=cols.index(feature), column=cols.index(feature),
seed=seed, seed=seed,
dropout=0.0, dropout=0.0,
@ -131,13 +131,13 @@ def MultiHashEmbed(
if also_embed_subwords: if also_embed_subwords:
embeddings = [ embeddings = [
make_hash_embed(NORM), make_hash_embed(LOWER),
make_hash_embed(PREFIX), make_hash_embed(PREFIX),
make_hash_embed(SUFFIX), make_hash_embed(SUFFIX),
make_hash_embed(SHAPE), make_hash_embed(SHAPE),
] ]
else: else:
embeddings = [make_hash_embed(NORM)] embeddings = [make_hash_embed(LOWER)]
concat_size = width * (len(embeddings) + also_use_static_vectors) concat_size = width * (len(embeddings) + also_use_static_vectors)
if also_use_static_vectors: if also_use_static_vectors:
model = chain( model = chain(
@ -180,13 +180,17 @@ def CharacterEmbed(
of being in an arbitrary position depending on the word length. of being in an arbitrary position depending on the word length.
The characters are embedded in a embedding table with a given number of rows, The characters are embedded in a embedding table with a given number of rows,
and the vectors concatenated. A hash-embedded vector of the NORM of the word is and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
also concatenated on, and the result is then passed through a feed-forward also concatenated on, and the result is then passed through a feed-forward
network to construct a single vector to represent the information. network to construct a single vector to represent the information.
feature (int or str): An attribute to embed, to concatenate with the characters. feature (int or str): An attribute to embed, to concatenate with the characters.
width (int): The width of the output vector and the feature embedding. width (int): The width of the output vector and the feature embedding.
rows (int): The number of rows in the NORM hash embedding table. rows (int): The number of rows in the LOWER hash embedding table.
nM (int): The dimensionality of the character embeddings. Recommended values nM (int): The dimensionality of the character embeddings. Recommended values
are between 16 and 64. are between 16 and 64.
nC (int): The number of UTF-8 bytes to embed per word. Recommended values nC (int): The number of UTF-8 bytes to embed per word. Recommended values
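With NORM swapped for LOWER, the full-size embedding table is now keyed by the lowercase form. A sketch of building the layer directly, assuming the keyword arguments documented above for spacy.MultiHashEmbed.v1:

from spacy.ml.models.tok2vec import MultiHashEmbed

embed = MultiHashEmbed(
    width=96,
    rows=2000,                      # LOWER gets the full row count, the others rows // 2
    also_embed_subwords=True,       # adds the PREFIX, SUFFIX and SHAPE tables
    also_use_static_vectors=False,  # no pretrained vectors table required
)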

View File

@ -149,7 +149,7 @@ class Morphologizer(Tagger):
for example in get_examples(): for example in get_examples():
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
morph = token.morph_ morph = str(token.morph)
# create and add the combined morph+POS label # create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph) morph_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
@ -167,7 +167,7 @@ class Morphologizer(Tagger):
gold_array = [] gold_array = []
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
morph = token.morph_ morph = str(token.morph)
morph_dict = Morphology.feats_to_dict(morph) morph_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
morph_dict[self.POS_FEAT] = pos morph_dict[self.POS_FEAT] = pos

View File

@ -268,6 +268,9 @@ class Tagger(Pipe):
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.. returns a representative sample of gold-standard Example objects..
nlp (Language): The current nlp object the component is part of. nlp (Language): The current nlp object the component is part of.
labels: The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.
DOCS: https://nightly.spacy.io/api/tagger#initialize DOCS: https://nightly.spacy.io/api/tagger#initialize
""" """

View File

@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of. nlp (Language): The current nlp object the component is part of.
labels: The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
""" """

View File

@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
words = ["Eat", "blue", "ham"] words = ["Eat", "blue", "ham"]
morph = ["Feat=V", "Feat=J", "Feat=N"] morph = ["Feat=V", "Feat=J", "Feat=N"]
doc = Doc(en_vocab, words=words, morphs=morph) doc = Doc(en_vocab, words=words, morphs=morph)
assert morph[0] == doc[0].morph_ assert morph[0] == str(doc[0].morph)
assert morph[1] == doc[1].morph_ assert morph[1] == str(doc[1].morph)
assert morph[2] == doc[2].morph_ assert morph[2] == str(doc[2].morph)
feats_array = doc.to_array((ORTH, MORPH)) feats_array = doc.to_array((ORTH, MORPH))
assert feats_array[0][1] == doc[0].morph.key assert feats_array[0][1] == doc[0].morph.key

View File

@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
words = ["I", "live", "in", "New", "York", "."] words = ["I", "live", "in", "New", "York", "."]
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"] morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
# fmt: on # fmt: on
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words, morphs=morphs)
for i, morph in enumerate(morphs):
doc[i].morph_ = morph
attrs = [MORPH] attrs = [MORPH]
arr = doc.to_array(attrs) arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words) new_doc = Doc(en_vocab, words=words)
new_doc.from_array(attrs, arr) new_doc.from_array(attrs, arr)
assert [t.morph_ for t in new_doc] == morphs assert [str(t.morph) for t in new_doc] == morphs
assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc] assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
def test_doc_api_from_docs(en_tokenizer, de_tokenizer): def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
doc[0].tag_ = "A" doc[0].tag_ = "A"
doc[0].pos_ = "X" doc[0].pos_ = "X"
doc[0].morph_ = "Feat=Val" doc[0].set_morph("Feat=Val")
doc[0].lemma_ = "a" doc[0].lemma_ = "a"
doc[0].dep_ = "dep" doc[0].dep_ = "dep"
doc[0].head = doc[1] doc[0].head = doc[1]
@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
doc[1].tag_ = "A" doc[1].tag_ = "A"
doc[1].pos_ = "X" doc[1].pos_ = "X"
doc[1].morph_ = "" doc[1].set_morph("")
doc[1].lemma_ = "a" doc[1].lemma_ = "a"
doc[1].dep_ = "dep" doc[1].dep_ = "dep"
doc.ents = [Span(doc, 0, 2, label="HELLO")] doc.ents = [Span(doc, 0, 2, label="HELLO")]
@ -533,5 +531,78 @@ def test_doc_ents_setter():
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
vocab = Vocab() vocab = Vocab()
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)] ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
doc = Doc(vocab, words=words, ents=ents) doc = Doc(vocab, words=words, ents=ents)
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
def test_doc_morph_setter(en_tokenizer, de_tokenizer):
doc1 = en_tokenizer("a b")
doc1b = en_tokenizer("c d")
doc2 = de_tokenizer("a b")
# unset values can be copied
doc1[0].morph = doc1[1].morph
assert doc1[0].morph.key == 0
assert doc1[1].morph.key == 0
# morph values from the same vocab can be copied
doc1[0].set_morph("Feat=Val")
doc1[1].morph = doc1[0].morph
assert doc1[0].morph == doc1[1].morph
# ... also across docs
doc1b[0].morph = doc1[0].morph
assert doc1[0].morph == doc1b[0].morph
doc2[0].set_morph("Feat2=Val2")
# the morph value must come from the same vocab
with pytest.raises(ValueError):
doc1[0].morph = doc2[0].morph
def test_doc_init_iob():
"""Test ents validation/normalization in Doc.__init__"""
words = ["a", "b", "c", "d", "e"]
ents = ["O"] * len(words)
doc = Doc(Vocab(), words=words, ents=ents)
assert doc.ents == ()
ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2
ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 3
# None is missing
ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2
# empty tag is missing
ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2
# invalid IOB
ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)
# no dash
ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)
# no ent type
ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)
# not strings or None
ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)

View File

@ -4,13 +4,13 @@ import pytest
@pytest.fixture @pytest.fixture
def i_has(en_tokenizer): def i_has(en_tokenizer):
doc = en_tokenizer("I has") doc = en_tokenizer("I has")
doc[0].morph_ = {"PronType": "prs"} doc[0].set_morph({"PronType": "prs"})
doc[1].morph_ = { doc[1].set_morph({
"VerbForm": "fin", "VerbForm": "fin",
"Tense": "pres", "Tense": "pres",
"Number": "sing", "Number": "sing",
"Person": "three", "Person": "three",
} })
return doc return doc
@ -47,20 +47,20 @@ def test_morph_get(i_has):
def test_morph_set(i_has): def test_morph_set(i_has):
assert i_has[0].morph.get("PronType") == ["prs"] assert i_has[0].morph.get("PronType") == ["prs"]
# set by string # set by string
i_has[0].morph_ = "PronType=unk" i_has[0].set_morph("PronType=unk")
assert i_has[0].morph.get("PronType") == ["unk"] assert i_has[0].morph.get("PronType") == ["unk"]
# set by string, fields are alphabetized # set by string, fields are alphabetized
i_has[0].morph_ = "PronType=123|NounType=unk" i_has[0].set_morph("PronType=123|NounType=unk")
assert i_has[0].morph_ == "NounType=unk|PronType=123" assert str(i_has[0].morph) == "NounType=unk|PronType=123"
# set by dict # set by dict
i_has[0].morph_ = {"AType": "123", "BType": "unk"} i_has[0].set_morph({"AType": "123", "BType": "unk"})
assert i_has[0].morph_ == "AType=123|BType=unk" assert str(i_has[0].morph) == "AType=123|BType=unk"
# set by string with multiple values, fields and values are alphabetized # set by string with multiple values, fields and values are alphabetized
i_has[0].morph_ = "BType=c|AType=b,a" i_has[0].set_morph("BType=c|AType=b,a")
assert i_has[0].morph_ == "AType=a,b|BType=c" assert str(i_has[0].morph) == "AType=a,b|BType=c"
# set by dict with multiple values, fields and values are alphabetized # set by dict with multiple values, fields and values are alphabetized
i_has[0].morph_ = {"AType": "b,a", "BType": "c"} i_has[0].set_morph({"AType": "b,a", "BType": "c"})
assert i_has[0].morph_ == "AType=a,b|BType=c" assert str(i_has[0].morph) == "AType=a,b|BType=c"
def test_morph_str(i_has): def test_morph_str(i_has):
@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
doc = tokenizer("a dog") doc = tokenizer("a dog")
# set through token.morph_ # set through token.morph_
doc[0].morph_ = "PronType=prs" doc[0].set_morph("PronType=prs")
assert doc[0].morph_ == "PronType=prs" assert str(doc[0].morph) == "PronType=prs"
assert doc.to_array(["MORPH"])[0] != 0 assert doc.to_array(["MORPH"])[0] != 0
# unset with token.morph # unset with token.morph
doc[0].morph = 0 doc[0].set_morph(0)
assert doc.to_array(["MORPH"])[0] == 0 assert doc.to_array(["MORPH"])[0] == 0
# empty morph is equivalent to "_" # empty morph is equivalent to "_"
doc[0].morph_ = "" doc[0].set_morph("")
assert doc[0].morph_ == "" assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# "_" morph is also equivalent to empty morph # "_" morph is also equivalent to empty morph
doc[0].morph_ = "_" doc[0].set_morph("_")
assert doc[0].morph_ == "" assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# set through existing hash with token.morph # set through existing hash with token.morph
tokenizer.vocab.strings.add("Feat=Val") tokenizer.vocab.strings.add("Feat=Val")
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val") doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
assert doc[0].morph_ == "Feat=Val" assert str(doc[0].morph) == "Feat=Val"

View File

@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
assert doc[4].text == "the beach boys" assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys " assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED" assert doc[4].tag_ == "NAMED"
assert doc[4].morph_ == "Number=Plur" assert str(doc[4].morph) == "Number=Plur"
assert doc[5].text == "all night" assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night" assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED" assert doc[5].tag_ == "NAMED"
assert doc[5].morph_ == "Number=Plur" assert str(doc[5].morph) == "Number=Plur"
def test_doc_retokenize_merge_children(en_tokenizer): def test_doc_retokenize_merge_children(en_tokenizer):
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15] heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)] ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
ents = ["O"] * len(heads)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[10] = "B-GPE"
ents[13] = "B-PERSON"
ents[14] = "I-PERSON"
# fmt: on # fmt: on
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = Doc( doc = Doc(
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# if there is a parse, span.root provides default values # if there is a parse, span.root provides default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0] heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)] ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-fg"
ents[6] = "I-ent-fg"
deps = ["dep"] * len(words) deps = ["dep"] * len(words)
en_vocab.strings.add("ent-de") en_vocab.strings.add("ent-de")
en_vocab.strings.add("ent-fg") en_vocab.strings.add("ent-fg")
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# check that B is preserved if span[start] is B # check that B is preserved if span[start] is B
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0] heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)] ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-de"
ents[6] = "I-ent-de"
deps = ["dep"] * len(words) deps = ["dep"] * len(words)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:

View File

@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
assert doc[0].text == "Los" assert doc[0].text == "Los"
assert doc[0].head.text == "Angeles" assert doc[0].head.text == "Angeles"
assert doc[0].idx == 0 assert doc[0].idx == 0
assert doc[0].morph_ == "Number=Sing" assert str(doc[0].morph) == "Number=Sing"
assert doc[1].idx == 3 assert doc[1].idx == 3
assert doc[1].text == "Angeles" assert doc[1].text == "Angeles"
assert doc[1].head.text == "start" assert doc[1].head.text == "start"
assert doc[1].morph_ == "Number=Sing" assert str(doc[1].morph) == "Number=Sing"
assert doc[2].text == "start" assert doc[2].text == "start"
assert doc[2].head.text == "." assert doc[2].head.text == "."
assert doc[3].text == "." assert doc[3].text == "."

View File

@ -9,7 +9,7 @@ def doc(en_vocab):
tags = ["VBP", "NN", "NN"] tags = ["VBP", "NN", "NN"]
heads = [0, 0, 0] heads = [0, 0, 0]
deps = ["ROOT", "dobj", "dobj"] deps = ["ROOT", "dobj", "dobj"]
ents = [("ORG", 1, 2)] ents = ["O", "B-ORG", "O"]
return Doc( return Doc(
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
) )

View File

@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
matcher.add("M", [pattern]) matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val" doc[0].set_morph("Feat=Val")
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2" doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
# IS_SUBSET acts like "IN" for attrs other than MORPH # IS_SUBSET acts like "IN" for attrs other than MORPH
@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
matcher.add("M", [pattern]) matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2" doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
# IS_SUPERSET with more than one value only matches for MORPH # IS_SUPERSET with more than one value only matches for MORPH
@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat2=Val2|Feat1=Val1" doc[0].set_morph("Feat2=Val2|Feat1=Val1")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat1=Val1|Feat2=Val2" doc[0].set_morph("Feat1=Val1|Feat2=Val2")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
# multiple values are split # multiple values are split
@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1" doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2" doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA" doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP # DEP requires DEP

View File

@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
matcher = PhraseMatcher(en_vocab, validate=True) matcher = PhraseMatcher(en_vocab, validate=True)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA" doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP # DEP requires DEP

View File

@ -339,7 +339,6 @@ def test_ner_warns_no_lookups(caplog):
nlp.vocab.lookups = Lookups() nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups) assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner") nlp.add_pipe("ner")
nlp.config["initialize"]["lookups"] = None
with caplog.at_level(logging.DEBUG): with caplog.at_level(logging.DEBUG):
nlp.initialize() nlp.initialize()
assert "W033" in caplog.text assert "W033" in caplog.text

View File

@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
a.add(**p) a.add(**p)
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler") nlp.remove_pipe("attribute_ruler")
@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
) )
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
dev_examples = [ dev_examples = [
Example.from_dict( Example.from_dict(
@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
for i in range(len(doc)): for i in range(len(doc)):
if i == 4: if i == 4:
assert doc[i].pos_ == "PUNCT" assert doc[i].pos_ == "PUNCT"
assert doc[i].morph_ == "PunctType=peri" assert str(doc[i].morph) == "PunctType=peri"
else: else:
assert doc[i].pos_ == "" assert doc[i].pos_ == ""
assert doc[i].morph_ == "" assert str(doc[i].morph) == ""
def test_attributeruler_morph_rules(nlp, morph_rules): def test_attributeruler_morph_rules(nlp, morph_rules):
@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
for i in range(len(doc)): for i in range(len(doc)):
if i != 2: if i != 2:
assert doc[i].pos_ == "" assert doc[i].pos_ == ""
assert doc[i].morph_ == "" assert str(doc[i].morph) == ""
else: else:
assert doc[2].pos_ == "DET" assert doc[2].pos_ == "DET"
assert doc[2].lemma_ == "a" assert doc[2].lemma_ == "a"
assert doc[2].morph_ == "Case=Nom" assert str(doc[2].morph) == "Case=Nom"
def test_attributeruler_indices(nlp): def test_attributeruler_indices(nlp):
@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
for i in range(len(doc)): for i in range(len(doc)):
if i == 1: if i == 1:
assert doc[i].lemma_ == "was" assert doc[i].lemma_ == "was"
assert doc[i].morph_ == "Case=Nom|Number=Sing" assert str(doc[i].morph) == "Case=Nom|Number=Sing"
elif i == 2: elif i == 2:
assert doc[i].lemma_ == "the" assert doc[i].lemma_ == "the"
assert doc[i].morph_ == "Case=Nom|Number=Plur" assert str(doc[i].morph) == "Case=Nom|Number=Plur"
elif i == 3: elif i == 3:
assert doc[i].lemma_ == "cat" assert doc[i].lemma_ == "cat"
else: else:
assert doc[i].morph_ == "" assert str(doc[i].morph) == ""
# raises an error when trying to modify a token outside of the match # raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2) a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
with pytest.raises(ValueError): with pytest.raises(ValueError):

View File

@ -91,7 +91,7 @@ def test_overfitting_IO():
doc = nlp(test_text) doc = nlp(test_text)
gold_morphs = ["Feat=N", "Feat=V", "", ""] gold_morphs = ["Feat=N", "Feat=V", "", ""]
gold_pos_tags = ["NOUN", "VERB", "ADJ", ""] gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
assert [t.morph_ for t in doc] == gold_morphs assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags
# Also test the results are still the same after IO # Also test the results are still the same after IO
@ -99,5 +99,5 @@ def test_overfitting_IO():
nlp.to_disk(tmp_dir) nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text) doc2 = nlp2(test_text)
assert [t.morph_ for t in doc2] == gold_morphs assert [str(t.morph) for t in doc2] == gold_morphs
assert [t.pos_ for t in doc2] == gold_pos_tags assert [t.pos_ for t in doc2] == gold_pos_tags

View File

@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
words = ["This", "is", "10", "%", "."] words = ["This", "is", "10", "%", "."]
tags = ["DT", "VBZ", "CD", "NN", "."] tags = ["DT", "VBZ", "CD", "NN", "."]
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
ents = [("PERCENT", 2, 4)] ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
assert doc.has_annotation("TAG") assert doc.has_annotation("TAG")
expected = ("10", "NUM", "CD", "PERCENT") expected = ("10", "NUM", "CD", "PERCENT")

View File

@ -76,7 +76,7 @@ def tagged_doc():
for i in range(len(tags)): for i in range(len(tags)):
doc[i].tag_ = tags[i] doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i] doc[i].pos_ = pos[i]
doc[i].morph_ = morphs[i] doc[i].set_morph(morphs[i])
if i > 0: if i > 0:
doc[i].is_sent_start = False doc[i].is_sent_start = False
return doc return doc
@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
doc = Doc( doc = Doc(
en_vocab, en_vocab,
words=input_.split(" "), words=input_.split(" "),
ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)], ents=["B-CARDINAL", "O", "B-CARDINAL"],
) )
entities = offsets_to_biluo_tags(doc, annot["entities"]) entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities}) example = Example.from_dict(doc, {"entities": entities})
@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
doc = Doc( doc = Doc(
en_vocab, en_vocab,
words=input_.split(" "), words=input_.split(" "),
ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)], ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
) )
entities = offsets_to_biluo_tags(doc, annot["entities"]) entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities}) example = Example.from_dict(doc, {"entities": entities})
@ -242,7 +242,7 @@ def test_tag_score(tagged_doc):
gold = { gold = {
"tags": [t.tag_ for t in tagged_doc], "tags": [t.tag_ for t in tagged_doc],
"pos": [t.pos_ for t in tagged_doc], "pos": [t.pos_ for t in tagged_doc],
"morphs": [t.morph_ for t in tagged_doc], "morphs": [str(t.morph) for t in tagged_doc],
"sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc], "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
} }
example = Example.from_dict(tagged_doc, gold) example = Example.from_dict(tagged_doc, gold)
@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
tags[0] = "NN" tags[0] = "NN"
pos = [t.pos_ for t in tagged_doc] pos = [t.pos_ for t in tagged_doc]
pos[1] = "X" pos[1] = "X"
morphs = [t.morph_ for t in tagged_doc] morphs = [str(t.morph) for t in tagged_doc]
morphs[1] = "Number=sing" morphs[1] = "Number=sing"
morphs[2] = "Number=plur" morphs[2] = "Number=plur"
gold = { gold = {

View File

@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots):
predicted = Doc(vocab, words=annots["words"]) predicted = Doc(vocab, words=annots["words"])
example = Example.from_dict(predicted, annots) example = Example.from_dict(predicted, annots)
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
assert token.morph_ == annots["morphs"][i] assert str(token.morph) == annots["morphs"][i]
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -30,7 +30,12 @@ def doc(en_vocab):
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."] lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9)) ents = ["O"] * len(words)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[5] = "B-LOC"
ents[6] = "I-LOC"
ents[8] = "B-GPE"
cats = {"TRAVEL": 1.0, "BAKING": 0.0} cats = {"TRAVEL": 1.0, "BAKING": 0.0}
# fmt: on # fmt: on
doc = Doc( doc = Doc(
@ -455,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc):
idx = [t.idx for t in doc] idx = [t.idx for t in doc]
tags = [t.tag_ for t in doc] tags = [t.tag_ for t in doc]
pos = [t.pos_ for t in doc] pos = [t.pos_ for t in doc]
morphs = [t.morph_ for t in doc] morphs = [str(t.morph) for t in doc]
lemmas = [t.lemma_ for t in doc] lemmas = [t.lemma_ for t in doc]
deps = [t.dep_ for t in doc] deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc] heads = [t.head.i for t in doc]
@ -477,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc):
assert idx == [t.idx for t in reloaded_example.reference] assert idx == [t.idx for t in reloaded_example.reference]
assert tags == [t.tag_ for t in reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference]
assert pos == [t.pos_ for t in reloaded_example.reference] assert pos == [t.pos_ for t in reloaded_example.reference]
assert morphs == [t.morph_ for t in reloaded_example.reference] assert morphs == [str(t.morph) for t in reloaded_example.reference]
assert lemmas == [t.lemma_ for t in reloaded_example.reference] assert lemmas == [t.lemma_ for t in reloaded_example.reference]
assert deps == [t.dep_ for t in reloaded_example.reference] assert deps == [t.dep_ for t in reloaded_example.reference]
assert heads == [t.head.i for t in reloaded_example.reference] assert heads == [t.head.i for t in reloaded_example.reference]

View File

@ -101,7 +101,7 @@ class DocBin:
self.strings.add(token.text) self.strings.add(token.text)
self.strings.add(token.tag_) self.strings.add(token.tag_)
self.strings.add(token.lemma_) self.strings.add(token.lemma_)
self.strings.add(token.morph_) self.strings.add(str(token.morph))
self.strings.add(token.dep_) self.strings.add(token.dep_)
self.strings.add(token.ent_type_) self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_) self.strings.add(token.ent_kb_id_)

View File

@ -213,8 +213,9 @@ cdef class Doc:
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
the same length as words, to assign as token.is_sent_start. Will be the same length as words, to assign as token.is_sent_start. Will be
overridden by heads if heads is provided. Defaults to None. overridden by heads if heads is provided. Defaults to None.
ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of ents (Optional[List[str]]): A list of unicode strings, of the same
(label, start, end) tuples to assign as doc.ents. Defaults to None. length as words, as IOB tags to assign as token.ent_iob and
token.ent_type. Defaults to None.
DOCS: https://nightly.spacy.io/api/doc#init DOCS: https://nightly.spacy.io/api/doc#init
""" """
@ -275,16 +276,55 @@ cdef class Doc:
sent_starts[i] = -1 sent_starts[i] = -1
elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]: elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
sent_starts[i] = 0 sent_starts[i] = 0
ent_iobs = None
ent_types = None
if ents is not None:
iob_strings = Token.iob_strings()
# make valid IOB2 out of IOB1 or IOB2
for i, ent in enumerate(ents):
if ent is "":
ents[i] = None
elif ent is not None and not isinstance(ent, str):
raise ValueError(Errors.E177.format(tag=ent))
if i < len(ents) - 1:
# OI -> OB
if (ent is None or ent.startswith("O")) and \
(ents[i+1] is not None and ents[i+1].startswith("I")):
ents[i+1] = "B" + ents[i+1][1:]
# B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
if ent is not None and ents[i+1] is not None and \
(ent.startswith("B") or ent.startswith("I")) and \
ents[i+1].startswith("I") and \
ent[1:] != ents[i+1][1:]:
ents[i+1] = "B" + ents[i+1][1:]
ent_iobs = []
ent_types = []
for ent in ents:
if ent is None:
ent_iobs.append(iob_strings.index(""))
ent_types.append("")
elif ent == "O":
ent_iobs.append(iob_strings.index(ent))
ent_types.append("")
else:
if len(ent) < 3 or ent[1] != "-":
raise ValueError(Errors.E177.format(tag=ent))
ent_iob, ent_type = ent.split("-", 1)
if ent_iob not in iob_strings:
raise ValueError(Errors.E177.format(tag=ent))
ent_iob = iob_strings.index(ent_iob)
ent_iobs.append(ent_iob)
ent_types.append(ent_type)
headings = [] headings = []
values = [] values = []
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts] annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START] possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
for a, annot in enumerate(annotations): for a, annot in enumerate(annotations):
if annot is not None: if annot is not None:
if len(annot) != len(words): if len(annot) != len(words):
raise ValueError(Errors.E189) raise ValueError(Errors.E189)
headings.append(possible_headings[a]) headings.append(possible_headings[a])
if annot is not heads and annot is not sent_starts: if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
values.extend(annot) values.extend(annot)
for value in values: for value in values:
self.vocab.strings.add(value) self.vocab.strings.add(value)
@ -296,7 +336,7 @@ cdef class Doc:
j = 0 j = 0
for annot in annotations: for annot in annotations:
if annot: if annot:
if annot is heads or annot is sent_starts: if annot is heads or annot is sent_starts or annot is ent_iobs:
for i in range(len(words)): for i in range(len(words)):
if attrs.ndim == 1: if attrs.ndim == 1:
attrs[i] = annot[i] attrs[i] = annot[i]
@ -317,8 +357,6 @@ cdef class Doc:
attrs[i, j] = self.vocab.strings[annot[i]] attrs[i, j] = self.vocab.strings[annot[i]]
j += 1 j += 1
self.from_array(headings, attrs) self.from_array(headings, attrs)
if ents is not None:
self.ents = ents
@property @property
def _(self): def _(self):
@ -1210,7 +1248,7 @@ cdef class Doc:
for token in self: for token in self:
strings.add(token.tag_) strings.add(token.tag_)
strings.add(token.lemma_) strings.add(token.lemma_)
strings.add(token.morph_) strings.add(str(token.morph))
strings.add(token.dep_) strings.add(token.dep_)
strings.add(token.ent_type_) strings.add(token.ent_type_)
strings.add(token.ent_kb_id_) strings.add(token.ent_kb_id_)

View File

@ -215,20 +215,20 @@ cdef class Token:
def __get__(self): def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph) return MorphAnalysis.from_id(self.vocab, self.c.morph)
def __set__(self, attr_t morph): def __set__(self, MorphAnalysis morph):
if morph == 0: # Check that the morph has the same vocab
self.c.morph = morph if self.vocab != morph.vocab:
elif morph in self.vocab.strings: raise ValueError(Errors.E1013)
self.morph_ = self.vocab.strings[morph] self.c.morph = morph.c.key
def set_morph(self, features):
cdef hash_t key
if features is 0:
self.c.morph = 0
else: else:
raise ValueError(Errors.E1009.format(val=morph)) if isinstance(features, int):
features = self.vocab.strings[features]
property morph_: key = self.vocab.morphology.add(features)
def __get__(self):
return str(MorphAnalysis.from_id(self.vocab, self.c.morph))
def __set__(self, features):
cdef hash_t key = self.vocab.morphology.add(features)
self.c.morph = key self.c.morph = key
@property @property

View File

@ -207,6 +207,7 @@ def conllu_sentence_to_doc(
pos=poses, pos=poses,
deps=deps, deps=deps,
lemmas=lemmas, lemmas=lemmas,
morphs=morphs,
heads=heads, heads=heads,
) )
for i in range(len(doc)): for i in range(len(doc)):

View File

@ -1,4 +1,4 @@
from collections import Iterable as IterableInstance from collections.abc import Iterable as IterableInstance
import warnings import warnings
import numpy import numpy
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
@ -226,7 +226,7 @@ cdef class Example:
"TAG": [t.tag_ for t in self.reference], "TAG": [t.tag_ for t in self.reference],
"LEMMA": [t.lemma_ for t in self.reference], "LEMMA": [t.lemma_ for t in self.reference],
"POS": [t.pos_ for t in self.reference], "POS": [t.pos_ for t in self.reference],
"MORPH": [t.morph_ for t in self.reference], "MORPH": [str(t.morph) for t in self.reference],
"HEAD": [t.head.i for t in self.reference], "HEAD": [t.head.i for t in self.reference],
"DEP": [t.dep_ for t in self.reference], "DEP": [t.dep_ for t in self.reference],
"SENT_START": [int(bool(t.is_sent_start)) for t in self.reference] "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]

View File

@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
if include_annotation["POS"]: if include_annotation["POS"]:
json_token["pos"] = token.pos_ json_token["pos"] = token.pos_
if include_annotation["MORPH"]: if include_annotation["MORPH"]:
json_token["morph"] = token.morph_ json_token["morph"] = str(token.morph)
if include_annotation["LEMMA"]: if include_annotation["LEMMA"]:
json_token["lemma"] = token.lemma_ json_token["lemma"] = token.lemma_
if include_annotation["DEP"]: if include_annotation["DEP"]:

View File

@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
Construct an embedding layer that separately embeds a number of lexical Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build mixed representations. The features used are a feed-forward subnetwork to build mixed representations. The features used are
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained [FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
static vectors can also be incorporated into the concatenated representation. vectors can also be incorporated into the concatenated representation.
| Name | Description | | Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ | | `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
### spacy.FeatureExtractor.v1 {#FeatureExtractor}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.FeatureExtractor.v1"
> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
> ```
Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
of feature names to extract, which should refer to token attributes.
| Name |  Description |
| ----------- | ------------------------------------------------------------------------ |
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
The following architectures are provided by the package The following architectures are provided by the package

View File

@ -186,15 +186,14 @@ This functionality was previously available as part of the command `init-model`.
</Infobox> </Infobox>
```cli ```cli
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose] $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
``` ```
| Name | Description | | Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ | | `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | | `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
@ -202,6 +201,39 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
### init labels {#init-labels new="3" tag="command"}
Generate JSON files for the labels in the data. This helps speed up the training
process, since spaCy won't have to preprocess the data to extract the labels.
After generating the labels, you can provide them to components that accept a
`labels` argument on initialization via the
[`[initialize]`](/api/data-formats#config-initialize) block of your config.
> #### Example config
>
> ```ini
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json
> ```
```cli
$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
```
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The label files, one JSON file per labeled component. |
## convert {#convert tag="command"} ## convert {#convert tag="command"}
Convert files into spaCy's Convert files into spaCy's

View File

@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.
> data_path = "/path/to/component_data" > data_path = "/path/to/component_data"
> ``` > ```
<!-- TODO: -->
| Name | Description | | Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ | | `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ |
@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)
## Lexical data for vocabulary {#vocab-jsonl new="2"} ## Lexical data for vocabulary {#vocab-jsonl new="2"}
To populate a pipeline's vocabulary, you can use the This data file can be provided via the `vocab_data` setting in the
[`spacy init vectors`](/api/cli#init-vectors) command and load in a `[initialize]` block of the training config to pre-define the lexical data to
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one initialize the `nlp` object's vocabulary with. The file should contain one
lexical entry per line via the `--jsonl-loc` option. The first line defines the lexical entry per line. The first line defines the language and vocabulary
language and vocabulary settings. All other lines are expected to be JSON settings. All other lines are expected to be JSON objects describing an
objects describing an individual lexeme. The lexical attributes will then be set individual lexeme. The lexical attributes will then be set as attributes on
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab` spaCy's [`Lexeme`](/api/lexeme#attributes) object.
command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
lexical data. > #### Example config
>
> ```ini
> [initialize]
> vocab_data = "/path/to/vocab-data.jsonl"
> ```
```python ```python
### First line ### First line

View File

@ -21,8 +21,9 @@ non-projective parses.
The parser is trained using an **imitation learning objective**. It follows the The parser is trained using an **imitation learning objective**. It follows the
actions predicted by the current weights, and at each state, determines which actions predicted by the current weights, and at each state, determines which
actions are compatible with the optimal parse that could be reached from the actions are compatible with the optimal parse that could be reached from the
current state. The weights are updated such that the scores assigned to the set of optimal actions are increased, while scores assigned to other actions are decreased. Note current state. The weights are updated such that the scores assigned to the set
that more than one action may be optimal for a given state. of optimal actions are increased, while scores assigned to other actions are
decreased. Note that more than one action may be optimal for a given state.
## Config and implementation {#config} ## Config and implementation {#config}
@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## DependencyParser.initialize {#initialize tag="method"} ## DependencyParser.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
network, network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize). by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training"> <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
@ -162,12 +166,22 @@ This method was previously called `begin_training`.
> parser = nlp.add_pipe("parser") > parser = nlp.add_pipe("parser")
> parser.initialize(lambda: [], nlp=nlp) > parser.initialize(lambda: [], nlp=nlp)
> ``` > ```
>
> ```ini
> ### config.cfg
> [initialize.components.parser]
>
> [initialize.components.parser.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/parser.json
> ```
| Name | Description | | Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
## DependencyParser.predict {#predict tag="method"} ## DependencyParser.predict {#predict tag="method"}

View File

@ -32,7 +32,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
> ``` > ```
| Name | Description | | Name | Description |
| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ | | `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | | `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
@ -45,7 +45,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ | | `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | | `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ |
| `ents` <Tag variant="new">3</Tag> | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ | | `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as token-based IOB tags. Defaults to `None`. ~~Optional[List[str]]~~ |
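To make the token-based format concrete, here is a small sketch mirroring the usage example elsewhere in the docs (the entity labels are purely illustrative):
```python
# Sketch of the token-based IOB format for `ents` (labels are illustrative).
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
spaces = [True, True, True, True, True, True, True, False]
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Apple', 'ORG'), ('U.K.', 'GPE')]
```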
## Doc.\_\_getitem\_\_ {#getitem tag="method"} ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.
Mark a span for merging. The `attrs` will be applied to the resulting token (if Mark a span for merging. The `attrs` will be applied to the resulting token (if
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
underlying lexeme (if they're context-independent lexical attributes like underlying lexeme (if they're context-independent lexical attributes like
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values. `LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
using the `"_"` key and specifying a dictionary that maps attribute names to
values.
> #### Example > #### Example
> >

View File

@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityLinker.initialize {#initialize tag="method"} ## EntityLinker.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are returns an iterable of [`Example`](/api/example) objects. The data examples are

View File

@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityRecognizer.initialize {#initialize tag="method"} ## EntityRecognizer.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
network, network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize). by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training"> <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
@ -152,12 +155,22 @@ This method was previously called `begin_training`.
> ner = nlp.add_pipe("ner") > ner = nlp.add_pipe("ner")
> ner.initialize(lambda: [], nlp=nlp) > ner.initialize(lambda: [], nlp=nlp)
> ``` > ```
>
> ```ini
> ### config.cfg
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json
> ```
| Name | Description | | Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
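As a sketch of the new `labels` argument (the file is assumed to come from [`init labels`](/api/cli#init-labels); its exact JSON layout is an assumption here):
```python
# Sketch: pass pre-generated label data instead of extracting it from the corpus.
import spacy
import srsly

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
labels = srsly.read_json("corpus/labels/ner.json")  # hypothetical path
ner.initialize(lambda: [], nlp=nlp, labels=labels)
```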
## EntityRecognizer.predict {#predict tag="method"} ## EntityRecognizer.predict {#predict tag="method"}

View File

@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ | | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
## Language.initialize {#initialize tag="method"} ## Language.initialize {#initialize tag="method" new="3"}
Initialize the pipeline for training and return an Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the [`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the

View File

@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
network, network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize). by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
> #### Example > #### Example
> >
@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).
> morphologizer = nlp.add_pipe("morphologizer") > morphologizer = nlp.add_pipe("morphologizer")
> morphologizer.initialize(lambda: [], nlp=nlp) > morphologizer.initialize(lambda: [], nlp=nlp)
> ``` > ```
>
> ```ini
> ### config.cfg
> [initialize.components.morphologizer]
>
> [initialize.components.morphologizer.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/morphologizer.json
> ```
| Name | Description | | Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
## Morphologizer.predict {#predict tag="method"} ## Morphologizer.predict {#predict tag="method"}

View File

@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Pipe.initialize {#initialize tag="method"} ## Pipe.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are returns an iterable of [`Example`](/api/example) objects. The data examples are

View File

@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tagger.initialize {#initialize tag="method"} ## Tagger.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
network, network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize). by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training"> <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
@ -135,12 +138,22 @@ This method was previously called `begin_training`.
> tagger = nlp.add_pipe("tagger") > tagger = nlp.add_pipe("tagger")
> tagger.initialize(lambda: [], nlp=nlp) > tagger.initialize(lambda: [], nlp=nlp)
> ``` > ```
>
> ```ini
> ### config.cfg
> [initialize.components.tagger]
>
> [initialize.components.tagger.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/tagger.json
> ```
| Name | Description | | Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |
## Tagger.predict {#predict tag="method"} ## Tagger.predict {#predict tag="method"}

View File

@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## TextCategorizer.initialize {#initialize tag="method"} ## TextCategorizer.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
network, network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize). by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training"> <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
@ -148,12 +151,22 @@ This method was previously called `begin_training`.
> textcat = nlp.add_pipe("textcat") > textcat = nlp.add_pipe("textcat")
> textcat.initialize(lambda: [], nlp=nlp) > textcat.initialize(lambda: [], nlp=nlp)
> ``` > ```
>
> ```ini
> ### config.cfg
> [initialize.components.textcat]
>
> [initialize.components.textcat.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/textcat.json
> ```
| Name | Description | | Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
## TextCategorizer.predict {#predict tag="method"} ## TextCategorizer.predict {#predict tag="method"}

View File

@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| **CREATES** | The corpus reader. ~~JsonlTexts~~ | | **CREATES** | The corpus reader. ~~JsonlTexts~~ |
### spacy.read_labels.v1 {#read_labels tag="registered function"}
Read a JSON-formatted labels file generated with
[`init labels`](/api/cli#init-labels). Typically used in the
[`[initialize]`](/api/data-formats#config-initialize) block of the training
config to speed up the model initialization process and provide pre-generated
label sets.
> #### Example config
>
> ```ini
> [initialize.components]
>
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json"
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
| **CREATES** | The labels read from the file, or `None` if the file doesn't exist and `require` is `False`. |
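For reference, a hedged sketch of calling the registered reader outside the config (the keyword name `path` follows the table above; passing a `Path` object is an assumption):
```python
# Sketch: resolve and call the registered reader directly.
from pathlib import Path
from spacy import registry

read_labels = registry.readers.get("spacy.read_labels.v1")
labels = read_labels(path=Path("corpus/labels/ner.json"))
print(labels)  # None if the file is missing and require is False
```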
## Batchers {#batchers source="spacy/training/batchers.py" new="3"} ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
A data batcher implements a batching strategy that essentially turns a stream of A data batcher implements a batching strategy that essentially turns a stream of

View File

@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
embeddings. embeddings.
```python ```python
from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor from thinc.api import add, chain, remap_ids, Embed
from spacy.ml.staticvectors import StaticVectors from spacy.ml.staticvectors import StaticVectors
from spacy.ml.featureextractor import FeatureExtractor
from spacy.util import registry from spacy.util import registry
@registry.architectures("my_example.MyEmbedding.v1") @registry.architectures("my_example.MyEmbedding.v1")

View File

@ -204,7 +204,19 @@ initialize it.
![Illustration of pipeline lifecycle](../images/lifecycle.svg) ![Illustration of pipeline lifecycle](../images/lifecycle.svg)
<!-- TODO: explain lifecycle and initialization --> At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
config and load all data, including tokenization rules, model weights and other
resources from the pipeline directory. The `[training]` block contains the
settings for training the model and is only used during training. Similarly, the
`[initialize]` block defines how the initial `nlp` object should be set up
before training and whether it should be initialized with vectors or pretrained
tok2vec weights, or any other data needed by the components.
The initialization settings are only loaded and used when
[`nlp.initialize`](/api/language#initialize) is called (typically right before
training). This allows you to set up your pipeline using local data resources
and custom functions, and preserve that information in your config without
requiring it to be available at runtime.
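A rough sketch of that lifecycle (file paths are placeholders, and the config is assumed to be complete enough to auto-fill):
```python
# Sketch: the [initialize] block is only resolved when nlp.initialize() runs.
import spacy
from spacy import util

config = util.load_config("./config.cfg")
nlp = util.load_model_from_config(config, auto_fill=True)
nlp.initialize()             # resolves [initialize], e.g. vectors or tok2vec weights
nlp.to_disk("./pipeline")    # the saved pipeline no longer needs that data

nlp = spacy.load("./pipeline")  # at runtime only [nlp] and [components] are used
```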
### Overwriting config settings on the command line {#config-overrides} ### Overwriting config settings on the command line {#config-overrides}
@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width) return create_model(output_width)
``` ```
<!-- TODO:
### Customizing the initialization {#initialization}
-->
## Data utilities {#data} ## Data utilities {#data}
spaCy includes various features and utilities to make it easy to train models spaCy includes various features and utilities to make it easy to train models
@ -853,7 +869,7 @@ nlp = spacy.blank("en")
docbin = DocBin(nlp.vocab) docbin = DocBin(nlp.vocab)
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."] words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
spaces = [True, True, True, True, True, True, True, False] spaces = [True, True, True, True, True, True, True, False]
ents = [("ORG", 0, 1), ("GPE", 5, 6)] ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents) doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
docbin.add(doc) docbin.add(doc)
docbin.to_disk("./train.spacy") docbin.to_disk("./train.spacy")

View File

@ -104,7 +104,6 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
> >
> ```ini > ```ini
> [training] > [training]
> vectors = null
> accumulate_gradient = 3 > accumulate_gradient = 3
> >
> [training.optimizer] > [training.optimizer]
@ -430,6 +429,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). | | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. | | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`init vectors`](/api/cli#init-vectors) | Convert word vectors for use with spaCy. |
| [`init labels`](/api/cli#init-labels) | Generate JSON files for the labels in the data to speed up training. |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. | | [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |

View File

@ -1,6 +1,11 @@
const autoprefixer = require('autoprefixer') const autoprefixer = require('autoprefixer')
const path = require('path') const path = require('path')
// https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/
const sharp = require('sharp')
sharp.cache(false)
sharp.simd(false)
// Markdown plugins // Markdown plugins
const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js') const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js')
const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js') const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js')