This commit is contained in:
Matthew Honnibal 2020-10-01 23:07:53 +02:00
commit 75a1569908
59 changed files with 576 additions and 342 deletions

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a26"
__version__ = "3.0.0a28"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -7,6 +7,7 @@ import srsly
from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
@ -19,9 +20,9 @@ def init_vectors_cli(
output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
# fmt: on
):
"""Convert word vectors for use with spaCy. Will export an nlp object that
@ -32,12 +33,7 @@ def init_vectors_cli(
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
lex_attrs = srsly.read_jsonl(jsonl_loc)
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
update_lexemes(nlp, jsonl_loc)
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)
@ -48,6 +44,16 @@ def init_vectors_cli(
)
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    """Set lexeme attributes on the vocab of `nlp` from a JSONL file.

    Mostly used for backwards-compatibility and may be removed in the future.

    nlp (Language): The nlp object whose vocab entries are updated.
    jsonl_loc (Path): Location of the JSONL-formatted attributes file. Each
        record is looked up in the vocab by its "orth" value and applied via
        the lexeme's set_attrs method.
    """
    for record in srsly.read_jsonl(jsonl_loc):
        # Records carrying a "settings" key are not lexeme entries: skip them.
        if "settings" not in record:
            nlp.vocab[record["orth"]].set_attrs(**record)
@init_cli.command(
"nlp",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
@ -89,7 +95,7 @@ def init_labels_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""Generate a JSON file for labels in the data. This helps speed up the
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)

View File

@ -2,7 +2,6 @@
train = null
dev = null
vectors = null
vocab_data = null
init_tok2vec = null
[system]
@ -11,8 +10,13 @@ gpu_allocator = null
[nlp]
lang = null
# List of pipeline component names, in order. The names should correspond to
# components defined in the [components] block
pipeline = []
# Components that are loaded but disabled by default
disabled = []
# Optional callbacks to modify the nlp object before it's initialized, after
# it's created and after the pipeline has been set up
before_creation = null
after_creation = null
after_pipeline_creation = null
@ -20,6 +24,7 @@ after_pipeline_creation = null
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
# The pipeline components and their models
[components]
# Readers for corpora like dev and train.
@ -38,8 +43,7 @@ max_length = 0
limit = 0
# Apply some simple data augmentation, where we replace tokens with variations.
# This is especially useful for punctuation and case replacement, to help
# generalize beyond corpora that don't have smart-quotes, or only have smart
# quotes, etc.
# generalize beyond corpora that don't/only have smart quotes etc.
augmenter = null
[corpora.dev]
@ -53,6 +57,7 @@ gold_preproc = false
max_length = 0
# Limitation on number of training examples
limit = 0
# Optional callback for data augmentation
augmenter = null
# Training hyper-parameters and additional features.
@ -102,17 +107,18 @@ use_averages = false
eps = 1e-8
learn_rate = 0.001
# The 'initialize' step is run before training or pretraining. Components and
# the tokenizer can each define their own arguments via their .initialize
# methods that are populated by the config. This lets them gather resources like
# lookup tables and build label sets, construct vocabularies, etc.
# These settings are used when nlp.initialize() is called (typically before
# training or pretraining). Components and the tokenizer can each define their
# own arguments via their initialize methods that are populated by the config.
# This lets them gather data resources, build label sets etc.
[initialize]
vocab_data = ${paths.vocab_data}
lookups = null
vectors = ${paths.vectors}
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec}
# Data and lookups for vocabulary
vocab_data = null
lookups = null
# Arguments passed to the tokenizer's initialize method
tokenizer = {}
# Arguments passed to the initialize methods of the components (keyed by component name)
# Arguments for initialize methods of the components (keyed by component)
components = {}

View File

@ -710,6 +710,9 @@ class Errors:
"options: {modes}")
E1012 = ("Entity spans and blocked/missing/outside spans should be "
"provided to doc.set_ents as lists of `Span` objects.")
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
"token itself. To set the morph from this MorphAnalysis, set from "
"the string value with: `token.set_morph(str(other_morph))`.")
@add_codes

View File

@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class DanishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class GermanDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .lemmatizer import GreekLemmatizer
from ...lookups import Lookups
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class GreekDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@ -4,21 +4,9 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class IndonesianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class LuxembourgishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS

View File

@ -3,21 +3,9 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class PortugueseDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES

View File

@ -7,21 +7,9 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ...language import Language
from ...lookups import Lookups
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class RussianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@ -2,21 +2,9 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class SerbianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@ -1,21 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class TamilDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@ -10,13 +10,6 @@ DEFAULT_CONFIG = """
[nlp.tokenizer]
@tokenizers = "spacy.th.ThaiTokenizer"
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

View File

@ -0,0 +1,25 @@
from typing import List, Union, Callable, Tuple
from thinc.types import Ints2d, Doc
from thinc.api import Model, registry
@registry.layers("spacy.FeatureExtractor.v1")
def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
    """Create the "extract_features" layer, which uses `forward` as its
    forward function.

    columns (List[Union[int, str]]): The attributes to extract. They are
        stored on the model under the "columns" attr.
    RETURNS (Model[List[Doc], List[Ints2d]]): The feature extraction layer.
    """
    layer_attrs = {"columns": columns}
    return Model("extract_features", forward, attrs=layer_attrs)
def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
    """Extract one integer feature array per input object.

    model (Model): The layer; its "columns" attr lists the attributes to read.
    docs: The objects to extract from. An object without a `to_array` method
        is handled through its `.doc`, sliced by its `.start` and `.end`.
    is_train (bool): Not used by this function.
    RETURNS (Tuple[List[Ints2d], Callable]): The extracted arrays and a
        backprop callback that always returns an empty list.
    """
    wanted = model.attrs["columns"]
    extracted: List[Ints2d] = []
    for item in docs:
        if not hasattr(item, "to_array"):
            # No to_array on the object itself: read from the parent doc and
            # keep only the rows between start and end.
            values = item.doc.to_array(wanted)[item.start : item.end]
        else:
            values = item.to_array(wanted)
        # A 1-dimensional result is turned into a single-column 2d array.
        if values.ndim == 1:
            values = values.reshape((values.shape[0], 1))
        extracted.append(model.ops.asarray2i(values, dtype="uint64"))

    def backprop(d_features: List[Ints2d]) -> List:
        return []

    return extracted, backprop

View File

@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor
from thinc.api import Relu, residual, expand_window
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
@registry.architectures.register("spacy.TextCatCNN.v1")

View File

@ -1,16 +1,16 @@
from typing import Optional, List, Union
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list
from thinc.api import FeatureExtractor, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from thinc.types import Floats2d
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc
from ...util import registry
from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
@registry.architectures.register("spacy.Tok2VecListener.v1")
@ -98,7 +98,7 @@ def MultiHashEmbed(
attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representation.
The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
varying definitions depending on the Vocab of the Doc object passed in.
Vectors from pretrained static vectors can also be incorporated into the
concatenated representation.
@ -115,7 +115,7 @@ def MultiHashEmbed(
also_use_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
seed = 7
def make_hash_embed(feature):
@ -123,7 +123,7 @@ def MultiHashEmbed(
seed += 1
return HashEmbed(
width,
rows if feature == NORM else rows // 2,
rows if feature == LOWER else rows // 2,
column=cols.index(feature),
seed=seed,
dropout=0.0,
@ -131,13 +131,13 @@ def MultiHashEmbed(
if also_embed_subwords:
embeddings = [
make_hash_embed(NORM),
make_hash_embed(LOWER),
make_hash_embed(PREFIX),
make_hash_embed(SUFFIX),
make_hash_embed(SHAPE),
]
else:
embeddings = [make_hash_embed(NORM)]
embeddings = [make_hash_embed(LOWER)]
concat_size = width * (len(embeddings) + also_use_static_vectors)
if also_use_static_vectors:
model = chain(
@ -180,13 +180,17 @@ def CharacterEmbed(
of being in an arbitrary position depending on the word length.
The characters are embedded in an embedding table with a given number of rows,
and the vectors concatenated. A hash-embedded vector of the NORM of the word is
and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
also concatenated on, and the result is then passed through a feed-forward
network to construct a single vector to represent the information.
feature (int or str): An attribute to embed, to concatenate with the characters.
width (int): The width of the output vector and the feature embedding.
rows (int): The number of rows in the LOWER hash embedding table.
nM (int): The dimensionality of the character embeddings. Recommended values
are between 16 and 64.
nC (int): The number of UTF-8 bytes to embed per word. Recommended values

View File

@ -149,7 +149,7 @@ class Morphologizer(Tagger):
for example in get_examples():
for i, token in enumerate(example.reference):
pos = token.pos_
morph = token.morph_
morph = str(token.morph)
# create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph)
if pos:
@ -167,7 +167,7 @@ class Morphologizer(Tagger):
gold_array = []
for i, token in enumerate(example.reference):
pos = token.pos_
morph = token.morph_
morph = str(token.morph)
morph_dict = Morphology.feats_to_dict(morph)
if pos:
morph_dict[self.POS_FEAT] = pos

View File

@ -268,6 +268,9 @@ class Tagger(Pipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
labels: The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.
DOCS: https://nightly.spacy.io/api/tagger#initialize
"""

View File

@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
labels: The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""

View File

@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
words = ["Eat", "blue", "ham"]
morph = ["Feat=V", "Feat=J", "Feat=N"]
doc = Doc(en_vocab, words=words, morphs=morph)
assert morph[0] == doc[0].morph_
assert morph[1] == doc[1].morph_
assert morph[2] == doc[2].morph_
assert morph[0] == str(doc[0].morph)
assert morph[1] == str(doc[1].morph)
assert morph[2] == str(doc[2].morph)
feats_array = doc.to_array((ORTH, MORPH))
assert feats_array[0][1] == doc[0].morph.key

View File

@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
words = ["I", "live", "in", "New", "York", "."]
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
# fmt: on
doc = Doc(en_vocab, words=words)
for i, morph in enumerate(morphs):
doc[i].morph_ = morph
doc = Doc(en_vocab, words=words, morphs=morphs)
attrs = [MORPH]
arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words)
new_doc.from_array(attrs, arr)
assert [t.morph_ for t in new_doc] == morphs
assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
assert [str(t.morph) for t in new_doc] == morphs
assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
doc[0].tag_ = "A"
doc[0].pos_ = "X"
doc[0].morph_ = "Feat=Val"
doc[0].set_morph("Feat=Val")
doc[0].lemma_ = "a"
doc[0].dep_ = "dep"
doc[0].head = doc[1]
@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
doc[1].tag_ = "A"
doc[1].pos_ = "X"
doc[1].morph_ = ""
doc[1].set_morph("")
doc[1].lemma_ = "a"
doc[1].dep_ = "dep"
doc.ents = [Span(doc, 0, 2, label="HELLO")]
@ -533,5 +531,78 @@ def test_doc_ents_setter():
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
vocab = Vocab()
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
doc = Doc(vocab, words=words, ents=ents)
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
def test_doc_morph_setter(en_tokenizer, de_tokenizer):
    """Test assigning token.morph from the morph of another token."""
    en_doc = en_tokenizer("a b")
    other_en_doc = en_tokenizer("c d")
    de_doc = de_tokenizer("a b")

    # copying an unset value leaves both tokens unset
    en_doc[0].morph = en_doc[1].morph
    assert en_doc[0].morph.key == 0
    assert en_doc[1].morph.key == 0

    # a value can be copied between tokens of one doc (same vocab)
    en_doc[0].set_morph("Feat=Val")
    en_doc[1].morph = en_doc[0].morph
    assert en_doc[0].morph == en_doc[1].morph

    # ... and into a token of another doc from the same tokenizer
    other_en_doc[0].morph = en_doc[0].morph
    assert en_doc[0].morph == other_en_doc[0].morph

    # a value coming from a different vocab is rejected
    de_doc[0].set_morph("Feat2=Val2")
    with pytest.raises(ValueError):
        en_doc[0].morph = de_doc[0].morph
def test_doc_init_iob():
    """Check how Doc.__init__ validates and normalizes the ents argument."""
    tokens = ["a", "b", "c", "d", "e"]

    # all-outside tags produce no entities
    outside_only = Doc(Vocab(), words=tokens, ents=["O"] * len(tokens))
    assert outside_only.ents == ()

    # (tags, expected number of entities)
    accepted = [
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"], 2),
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"], 3),
        # None is missing
        (["B-PERSON", "I-PERSON", "O", None, "I-GPE"], 2),
        # empty tag is missing
        (["", "B-PERSON", "O", "B-PERSON", "I-PERSON"], 2),
    ]
    for tags, n_ents in accepted:
        built = Doc(Vocab(), words=tokens, ents=tags)
        assert len(built.ents) == n_ents

    rejected = [
        # invalid IOB
        ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no dash
        ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no ent type
        ["O", "B-", "O", "I-PERSON", "I-GPE"],
        # not strings or None
        [0, "B-", "O", "I-PERSON", "I-GPE"],
    ]
    for tags in rejected:
        with pytest.raises(ValueError):
            Doc(Vocab(), words=tokens, ents=tags)

View File

@ -4,13 +4,13 @@ import pytest
@pytest.fixture
def i_has(en_tokenizer):
doc = en_tokenizer("I has")
doc[0].morph_ = {"PronType": "prs"}
doc[1].morph_ = {
doc[0].set_morph({"PronType": "prs"})
doc[1].set_morph({
"VerbForm": "fin",
"Tense": "pres",
"Number": "sing",
"Person": "three",
}
})
return doc
@ -47,20 +47,20 @@ def test_morph_get(i_has):
def test_morph_set(i_has):
assert i_has[0].morph.get("PronType") == ["prs"]
# set by string
i_has[0].morph_ = "PronType=unk"
i_has[0].set_morph("PronType=unk")
assert i_has[0].morph.get("PronType") == ["unk"]
# set by string, fields are alphabetized
i_has[0].morph_ = "PronType=123|NounType=unk"
assert i_has[0].morph_ == "NounType=unk|PronType=123"
i_has[0].set_morph("PronType=123|NounType=unk")
assert str(i_has[0].morph) == "NounType=unk|PronType=123"
# set by dict
i_has[0].morph_ = {"AType": "123", "BType": "unk"}
assert i_has[0].morph_ == "AType=123|BType=unk"
i_has[0].set_morph({"AType": "123", "BType": "unk"})
assert str(i_has[0].morph) == "AType=123|BType=unk"
# set by string with multiple values, fields and values are alphabetized
i_has[0].morph_ = "BType=c|AType=b,a"
assert i_has[0].morph_ == "AType=a,b|BType=c"
i_has[0].set_morph("BType=c|AType=b,a")
assert str(i_has[0].morph) == "AType=a,b|BType=c"
# set by dict with multiple values, fields and values are alphabetized
i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
assert i_has[0].morph_ == "AType=a,b|BType=c"
i_has[0].set_morph({"AType": "b,a", "BType": "c"})
assert str(i_has[0].morph) == "AType=a,b|BType=c"
def test_morph_str(i_has):
@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
doc = tokenizer("a dog")
# set through token.set_morph
doc[0].morph_ = "PronType=prs"
assert doc[0].morph_ == "PronType=prs"
doc[0].set_morph("PronType=prs")
assert str(doc[0].morph) == "PronType=prs"
assert doc.to_array(["MORPH"])[0] != 0
# unset with token.set_morph
doc[0].morph = 0
doc[0].set_morph(0)
assert doc.to_array(["MORPH"])[0] == 0
# empty morph is equivalent to "_"
doc[0].morph_ = ""
assert doc[0].morph_ == ""
doc[0].set_morph("")
assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# "_" morph is also equivalent to empty morph
doc[0].morph_ = "_"
assert doc[0].morph_ == ""
doc[0].set_morph("_")
assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# set through existing hash with token.set_morph
tokenizer.vocab.strings.add("Feat=Val")
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
assert doc[0].morph_ == "Feat=Val"
doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
assert str(doc[0].morph) == "Feat=Val"

View File

@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
assert doc[4].morph_ == "Number=Plur"
assert str(doc[4].morph) == "Number=Plur"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"
assert doc[5].morph_ == "Number=Plur"
assert str(doc[5].morph) == "Number=Plur"
def test_doc_retokenize_merge_children(en_tokenizer):
@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
ents = ["O"] * len(heads)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[10] = "B-GPE"
ents[13] = "B-PERSON"
ents[14] = "I-PERSON"
# fmt: on
tokens = en_tokenizer(text)
doc = Doc(
@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# if there is a parse, span.root provides default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-fg"
ents[6] = "I-ent-fg"
deps = ["dep"] * len(words)
en_vocab.strings.add("ent-de")
en_vocab.strings.add("ent-fg")
@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# check that B is preserved if span[start] is B
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-de"
ents[6] = "I-ent-de"
deps = ["dep"] * len(words)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
with doc.retokenize() as retokenizer:

View File

@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
assert doc[0].text == "Los"
assert doc[0].head.text == "Angeles"
assert doc[0].idx == 0
assert doc[0].morph_ == "Number=Sing"
assert str(doc[0].morph) == "Number=Sing"
assert doc[1].idx == 3
assert doc[1].text == "Angeles"
assert doc[1].head.text == "start"
assert doc[1].morph_ == "Number=Sing"
assert str(doc[1].morph) == "Number=Sing"
assert doc[2].text == "start"
assert doc[2].head.text == "."
assert doc[3].text == "."

View File

@ -9,7 +9,7 @@ def doc(en_vocab):
tags = ["VBP", "NN", "NN"]
heads = [0, 0, 0]
deps = ["ROOT", "dobj", "dobj"]
ents = [("ORG", 1, 2)]
ents = ["O", "B-ORG", "O"]
return Doc(
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
)

View File

@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val"
doc[0].set_morph("Feat=Val")
assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2"
doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 2
# IS_SUBSET acts like "IN" for attrs other than MORPH
@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2"
doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 1
# IS_SUPERSET with more than one value only matches for MORPH
@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
doc[0].set_morph("Feat2=Val2|Feat1=Val1")
assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
doc[0].set_morph("Feat1=Val1|Feat2=Val2")
assert len(matcher(doc)) == 2
# multiple values are split
@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
assert len(matcher(doc)) == 2
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val"
doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP

View File

@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val"
doc2[0].set_morph("Feat=Val")
doc3 = Doc(en_vocab, words=["Test"])
matcher = PhraseMatcher(en_vocab, validate=True)
with pytest.warns(UserWarning):
@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val"
doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP

View File

@ -339,7 +339,6 @@ def test_ner_warns_no_lookups(caplog):
nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner")
nlp.config["initialize"]["lookups"] = None
with caplog.at_level(logging.DEBUG):
nlp.initialize()
assert "W033" in caplog.text

View File

@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
a.add(**p)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")
@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
dev_examples = [
Example.from_dict(
@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
for i in range(len(doc)):
if i == 4:
assert doc[i].pos_ == "PUNCT"
assert doc[i].morph_ == "PunctType=peri"
assert str(doc[i].morph) == "PunctType=peri"
else:
assert doc[i].pos_ == ""
assert doc[i].morph_ == ""
assert str(doc[i].morph) == ""
def test_attributeruler_morph_rules(nlp, morph_rules):
@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
for i in range(len(doc)):
if i != 2:
assert doc[i].pos_ == ""
assert doc[i].morph_ == ""
assert str(doc[i].morph) == ""
else:
assert doc[2].pos_ == "DET"
assert doc[2].lemma_ == "a"
assert doc[2].morph_ == "Case=Nom"
assert str(doc[2].morph) == "Case=Nom"
def test_attributeruler_indices(nlp):
@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
for i in range(len(doc)):
if i == 1:
assert doc[i].lemma_ == "was"
assert doc[i].morph_ == "Case=Nom|Number=Sing"
assert str(doc[i].morph) == "Case=Nom|Number=Sing"
elif i == 2:
assert doc[i].lemma_ == "the"
assert doc[i].morph_ == "Case=Nom|Number=Plur"
assert str(doc[i].morph) == "Case=Nom|Number=Plur"
elif i == 3:
assert doc[i].lemma_ == "cat"
else:
assert doc[i].morph_ == ""
assert str(doc[i].morph) == ""
# raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
with pytest.raises(ValueError):

View File

@ -91,7 +91,7 @@ def test_overfitting_IO():
doc = nlp(test_text)
gold_morphs = ["Feat=N", "Feat=V", "", ""]
gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
assert [t.morph_ for t in doc] == gold_morphs
assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags
# Also test the results are still the same after IO
@ -99,5 +99,5 @@ def test_overfitting_IO():
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
assert [t.morph_ for t in doc2] == gold_morphs
assert [str(t.morph) for t in doc2] == gold_morphs
assert [t.pos_ for t in doc2] == gold_pos_tags

View File

@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
words = ["This", "is", "10", "%", "."]
tags = ["DT", "VBZ", "CD", "NN", "."]
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
ents = [("PERCENT", 2, 4)]
ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
assert doc.has_annotation("TAG")
expected = ("10", "NUM", "CD", "PERCENT")

View File

@ -76,7 +76,7 @@ def tagged_doc():
for i in range(len(tags)):
doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i]
doc[i].morph_ = morphs[i]
doc[i].set_morph(morphs[i])
if i > 0:
doc[i].is_sent_start = False
return doc
@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
doc = Doc(
en_vocab,
words=input_.split(" "),
ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
ents=["B-CARDINAL", "O", "B-CARDINAL"],
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
doc = Doc(
en_vocab,
words=input_.split(" "),
ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
@ -242,7 +242,7 @@ def test_tag_score(tagged_doc):
gold = {
"tags": [t.tag_ for t in tagged_doc],
"pos": [t.pos_ for t in tagged_doc],
"morphs": [t.morph_ for t in tagged_doc],
"morphs": [str(t.morph) for t in tagged_doc],
"sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
}
example = Example.from_dict(tagged_doc, gold)
@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
tags[0] = "NN"
pos = [t.pos_ for t in tagged_doc]
pos[1] = "X"
morphs = [t.morph_ for t in tagged_doc]
morphs = [str(t.morph) for t in tagged_doc]
morphs[1] = "Number=sing"
morphs[2] = "Number=plur"
gold = {

View File

@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots):
predicted = Doc(vocab, words=annots["words"])
example = Example.from_dict(predicted, annots)
for i, token in enumerate(example.reference):
assert token.morph_ == annots["morphs"][i]
assert str(token.morph) == annots["morphs"][i]
@pytest.mark.parametrize(

View File

@ -30,7 +30,12 @@ def doc(en_vocab):
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
ents = ["O"] * len(words)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[5] = "B-LOC"
ents[6] = "I-LOC"
ents[8] = "B-GPE"
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
# fmt: on
doc = Doc(
@ -455,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc):
idx = [t.idx for t in doc]
tags = [t.tag_ for t in doc]
pos = [t.pos_ for t in doc]
morphs = [t.morph_ for t in doc]
morphs = [str(t.morph) for t in doc]
lemmas = [t.lemma_ for t in doc]
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
@ -477,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc):
assert idx == [t.idx for t in reloaded_example.reference]
assert tags == [t.tag_ for t in reloaded_example.reference]
assert pos == [t.pos_ for t in reloaded_example.reference]
assert morphs == [t.morph_ for t in reloaded_example.reference]
assert morphs == [str(t.morph) for t in reloaded_example.reference]
assert lemmas == [t.lemma_ for t in reloaded_example.reference]
assert deps == [t.dep_ for t in reloaded_example.reference]
assert heads == [t.head.i for t in reloaded_example.reference]

View File

@ -101,7 +101,7 @@ class DocBin:
self.strings.add(token.text)
self.strings.add(token.tag_)
self.strings.add(token.lemma_)
self.strings.add(token.morph_)
self.strings.add(str(token.morph))
self.strings.add(token.dep_)
self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_)

View File

@ -213,8 +213,9 @@ cdef class Doc:
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
the same length as words, to assign as token.is_sent_start. Will be
overridden by heads if heads is provided. Defaults to None.
ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
(label, start, end) tuples to assign as doc.ents. Defaults to None.
ents (Optional[List[str]]): A list of unicode strings, of the same
length as words, as IOB tags to assign as token.ent_iob and
token.ent_type. Defaults to None.
DOCS: https://nightly.spacy.io/api/doc#init
"""
@ -275,16 +276,55 @@ cdef class Doc:
sent_starts[i] = -1
elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
sent_starts[i] = 0
ent_iobs = None
ent_types = None
if ents is not None:
iob_strings = Token.iob_strings()
# make valid IOB2 out of IOB1 or IOB2
for i, ent in enumerate(ents):
if ent is "":
ents[i] = None
elif ent is not None and not isinstance(ent, str):
raise ValueError(Errors.E177.format(tag=ent))
if i < len(ents) - 1:
# OI -> OB
if (ent is None or ent.startswith("O")) and \
(ents[i+1] is not None and ents[i+1].startswith("I")):
ents[i+1] = "B" + ents[i+1][1:]
# B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
if ent is not None and ents[i+1] is not None and \
(ent.startswith("B") or ent.startswith("I")) and \
ents[i+1].startswith("I") and \
ent[1:] != ents[i+1][1:]:
ents[i+1] = "B" + ents[i+1][1:]
ent_iobs = []
ent_types = []
for ent in ents:
if ent is None:
ent_iobs.append(iob_strings.index(""))
ent_types.append("")
elif ent == "O":
ent_iobs.append(iob_strings.index(ent))
ent_types.append("")
else:
if len(ent) < 3 or ent[1] != "-":
raise ValueError(Errors.E177.format(tag=ent))
ent_iob, ent_type = ent.split("-", 1)
if ent_iob not in iob_strings:
raise ValueError(Errors.E177.format(tag=ent))
ent_iob = iob_strings.index(ent_iob)
ent_iobs.append(ent_iob)
ent_types.append(ent_type)
headings = []
values = []
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
for a, annot in enumerate(annotations):
if annot is not None:
if len(annot) != len(words):
raise ValueError(Errors.E189)
headings.append(possible_headings[a])
if annot is not heads and annot is not sent_starts:
if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
values.extend(annot)
for value in values:
self.vocab.strings.add(value)
@ -296,7 +336,7 @@ cdef class Doc:
j = 0
for annot in annotations:
if annot:
if annot is heads or annot is sent_starts:
if annot is heads or annot is sent_starts or annot is ent_iobs:
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = annot[i]
@ -317,8 +357,6 @@ cdef class Doc:
attrs[i, j] = self.vocab.strings[annot[i]]
j += 1
self.from_array(headings, attrs)
if ents is not None:
self.ents = ents
@property
def _(self):
@ -1210,7 +1248,7 @@ cdef class Doc:
for token in self:
strings.add(token.tag_)
strings.add(token.lemma_)
strings.add(token.morph_)
strings.add(str(token.morph))
strings.add(token.dep_)
strings.add(token.ent_type_)
strings.add(token.ent_kb_id_)

View File

@ -215,20 +215,20 @@ cdef class Token:
def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
def __set__(self, attr_t morph):
if morph == 0:
self.c.morph = morph
elif morph in self.vocab.strings:
self.morph_ = self.vocab.strings[morph]
def __set__(self, MorphAnalysis morph):
# Check that the morph has the same vocab
if self.vocab != morph.vocab:
raise ValueError(Errors.E1013)
self.c.morph = morph.c.key
def set_morph(self, features):
cdef hash_t key
if features is 0:
self.c.morph = 0
else:
raise ValueError(Errors.E1009.format(val=morph))
property morph_:
def __get__(self):
return str(MorphAnalysis.from_id(self.vocab, self.c.morph))
def __set__(self, features):
cdef hash_t key = self.vocab.morphology.add(features)
if isinstance(features, int):
features = self.vocab.strings[features]
key = self.vocab.morphology.add(features)
self.c.morph = key
@property

View File

@ -207,6 +207,7 @@ def conllu_sentence_to_doc(
pos=poses,
deps=deps,
lemmas=lemmas,
morphs=morphs,
heads=heads,
)
for i in range(len(doc)):

View File

@ -1,4 +1,4 @@
from collections import Iterable as IterableInstance
from collections.abc import Iterable as IterableInstance
import warnings
import numpy
from murmurhash.mrmr cimport hash64
@ -226,7 +226,7 @@ cdef class Example:
"TAG": [t.tag_ for t in self.reference],
"LEMMA": [t.lemma_ for t in self.reference],
"POS": [t.pos_ for t in self.reference],
"MORPH": [t.morph_ for t in self.reference],
"MORPH": [str(t.morph) for t in self.reference],
"HEAD": [t.head.i for t in self.reference],
"DEP": [t.dep_ for t in self.reference],
"SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]

View File

@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
if include_annotation["POS"]:
json_token["pos"] = token.pos_
if include_annotation["MORPH"]:
json_token["morph"] = token.morph_
json_token["morph"] = str(token.morph)
if include_annotation["LEMMA"]:
json_token["lemma"] = token.lemma_
if include_annotation["DEP"]:

View File

@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build mixed representations. The features used are
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
static vectors can also be incorporated into the concatenated representation.
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
vectors can also be incorporated into the concatenated representation.
| Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
### spacy.FeatureExtractor.v1 {#FeatureExtractor}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.FeatureExtractor.v1"
> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
> ```
Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
of feature names to extract, which should refer to token attributes.
| Name |  Description |
| ----------- | ------------------------------------------------------------------------ |
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
The following architectures are provided by the package

View File

@ -186,15 +186,14 @@ This functionality was previously available as part of the command `init-model`.
</Infobox>
```cli
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose]
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
```
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
@ -202,6 +201,39 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
### init labels {#init-labels new="3" tag="command"}
Generate JSON files for the labels in the data. This helps speed up the training
process, since spaCy won't have to preprocess the data to extract the labels.
After generating the labels, you can provide them to components that accept a
`labels` argument on initialization via the
[`[initialize]`](/api/data-formats#config-initialize) block of your config.
> #### Example config
>
> ```ini
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json"
> ```
```cli
$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
```
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The label files: one JSON file per component in the output directory. |
## convert {#convert tag="command"}
Convert files into spaCy's

View File

@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.
> data_path = "/path/to/component_data"
> ```
<!-- TODO: -->
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ |
@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)
## Lexical data for vocabulary {#vocab-jsonl new="2"}
To populate a pipeline's vocabulary, you can use the
[`spacy init vectors`](/api/cli#init-vectors) command and load in a
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
lexical entry per line via the `--jsonl-loc` option. The first line defines the
language and vocabulary settings. All other lines are expected to be JSON
objects describing an individual lexeme. The lexical attributes will be then set
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
lexical data.
This data file can be provided via the `vocab_data` setting in the
`[initialize]` block of the training config to pre-define the lexical data to
initialize the `nlp` object's vocabulary with. The file should contain one
lexical entry per line. The first line defines the language and vocabulary
settings. All other lines are expected to be JSON objects describing an
individual lexeme. The lexical attributes will then be set as attributes on
spaCy's [`Lexeme`](/api/lexeme#attributes) object.
> #### Example config
>
> ```ini
> [initialize]
> vocab_data = "/path/to/vocab-data.jsonl"
> ```
```python
### First line

View File

@ -21,8 +21,9 @@ non-projective parses.
The parser is trained using an **imitation learning objective**. It follows the
actions predicted by the current weights, and at each state, determines which
actions are compatible with the optimal parse that could be reached from the
current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
that more than one action may be optimal for a given state.
current state. The weights are updated such that the scores assigned to the set
of optimal actions are increased, while scores assigned to other actions are
decreased. Note that more than one action may be optimal for a given state.
## Config and implementation {#config}
@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## DependencyParser.initialize {#initialize tag="method"}
## DependencyParser.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
@ -162,12 +166,22 @@ This method was previously called `begin_training`.
> parser = nlp.add_pipe("parser")
> parser.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.parser]
>
> [initialize.components.parser.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/parser.json"
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
## DependencyParser.predict {#predict tag="method"}

View File

@ -32,7 +32,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
> ```
| Name | Description |
| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
@ -45,7 +45,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ |
| `ents` <Tag variant="new">3</Tag> | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as the token-based IOB tags. Defaults to `None`. ~~Optional[List[str]]~~ |
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.
Mark a span for merging. The `attrs` will be applied to the resulting token (if
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
underlying lexeme (if they're context-independent lexical attributes like
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
using the `"_"` key and specifying a dictionary that maps attribute names to
values.
> #### Example
>

View File

@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityLinker.initialize {#initialize tag="method"}
## EntityLinker.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are

View File

@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityRecognizer.initialize {#initialize tag="method"}
## EntityRecognizer.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
@ -152,12 +155,22 @@ This method was previously called `begin_training`.
> ner = nlp.add_pipe("ner")
> ner.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json"
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
## EntityRecognizer.predict {#predict tag="method"}

View File

@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
## Language.initialize {#initialize tag="method"}
## Language.initialize {#initialize tag="method" new="3"}
Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the

View File

@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
> #### Example
>
@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).
> morphologizer = nlp.add_pipe("morphologizer")
> morphologizer.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.morphologizer]
>
> [initialize.components.morphologizer.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/morphologizer.json"
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
## Morphologizer.predict {#predict tag="method"}

View File

@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Pipe.initialize {#initialize tag="method"}
## Pipe.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are

View File

@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tagger.initialize {#initialize tag="method"}
## Tagger.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
@ -135,12 +138,22 @@ This method was previously called `begin_training`.
> tagger = nlp.add_pipe("tagger")
> tagger.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.tagger]
>
> [initialize.components.tagger.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/tagger.json"
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |
## Tagger.predict {#predict tag="method"}

View File

@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## TextCategorizer.initialize {#initialize tag="method"}
## TextCategorizer.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
by [`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
@ -148,12 +151,22 @@ This method was previously called `begin_training`.
> textcat = nlp.add_pipe("textcat")
> textcat.initialize(lambda: [], nlp=nlp)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.textcat]
>
> [initialize.components.textcat.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/textcat.json"
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
## TextCategorizer.predict {#predict tag="method"}

View File

@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| **CREATES** | The corpus reader. ~~JsonlTexts~~ |
### spacy.read_labels.v1 {#read_labels tag="registered function"}
Read a JSON-formatted labels file generated with
[`init labels`](/api/cli#init-labels). Typically used in the
[`[initialize]`](/api/data-formats#config-initialize) block of the training
config to speed up the model initialization process and provide pre-generated
label sets.
> #### Example config
>
> ```ini
> [initialize.components]
>
> [initialize.components.ner]
>
> [initialize.components.ner.labels]
> @readers = "spacy.read_labels.v1"
> path = "corpus/labels/ner.json"
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
| **CREATES** | The labels loaded from the file, or `None` if the file doesn't exist and `require` is set to `False`. |
## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
A data batcher implements a batching strategy that essentially turns a stream of

View File

@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
embeddings.
```python
from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor
from thinc.api import add, chain, remap_ids, Embed
from spacy.ml.staticvectors import StaticVectors
from spacy.ml.featureextractor import FeatureExtractor
from spacy.util import registry
@registry.architectures("my_example.MyEmbedding.v1")

View File

@ -204,7 +204,19 @@ initialize it.
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
<!-- TODO: explain lifecycle and initialization -->
At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
config and load all data, including tokenization rules, model weights and other
resources from the pipeline directory. The `[training]` block contains the
settings for training the model and is only used during training. Similarly, the
`[initialize]` block defines how the initial `nlp` object should be set up
before training and whether it should be initialized with vectors or pretrained
tok2vec weights, or any other data needed by the components.
The initialization settings are only loaded and used when
[`nlp.initialize`](/api/language#initialize) is called (typically right before
training). This allows you to set up your pipeline using local data resources
and custom functions, and preserve the information in your config without
requiring it to be available at runtime.
### Overwriting config settings on the command line {#config-overrides}
@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
<!-- TODO:
### Customizing the initialization {#initialization}
-->
## Data utilities {#data}
spaCy includes various features and utilities to make it easy to train models
@ -853,7 +869,7 @@ nlp = spacy.blank("en")
docbin = DocBin(nlp.vocab)
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
spaces = [True, True, True, True, True, True, True, False]
ents = [("ORG", 0, 1), ("GPE", 5, 6)]
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
docbin.add(doc)
docbin.to_disk("./train.spacy")

View File

@ -104,7 +104,6 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
>
> ```ini
> [training]
> vectors = null
> accumulate_gradient = 3
>
> [training.optimizer]
@ -430,6 +429,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`init vectors`](/api/cli#init-vectors) | Convert word vectors for use with spaCy. |
| [`init labels`](/api/cli#init-labels) | Generate JSON files for the labels in the data to speed up training. |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |

View File

@ -1,6 +1,11 @@
const autoprefixer = require('autoprefixer')
const path = require('path')
// https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/
const sharp = require('sharp')
sharp.cache(false)
sharp.simd(false)
// Markdown plugins
const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js')
const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js')