Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)

Commit 75a1569908: Merge
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a26"
__version__ = "3.0.0a28"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -7,6 +7,7 @@ import srsly
from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu

@@ -19,9 +20,9 @@ def init_vectors_cli(
output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
# fmt: on
):
"""Convert word vectors for use with spaCy. Will export an nlp object that

@@ -32,12 +33,7 @@ def init_vectors_cli(
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
lex_attrs = srsly.read_jsonl(jsonl_loc)
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
update_lexemes(nlp, jsonl_loc)
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)

@@ -48,6 +44,16 @@ def init_vectors_cli(
)

def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
# Mostly used for backwards-compatibility and may be removed in the future
lex_attrs = srsly.read_jsonl(jsonl_loc)
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)

@init_cli.command(
"nlp",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},

@@ -89,7 +95,7 @@ def init_labels_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""Generate a JSON file for labels in the data. This helps speed up the
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)

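For orientation, the `init vectors` flow shown in this hunk reduces to a few library calls. This is only a condensed sketch based on the code above; the file paths are illustrative, not part of the commit.

```python
from pathlib import Path

from spacy import util
from spacy.training.initialize import convert_vectors

# Blank nlp object for the requested language, then convert and save the vectors
nlp = util.get_lang_class("en")()
convert_vectors(nlp, Path("vectors.txt"), truncate=0, prune=-1, name=None)
nlp.to_disk(Path("./output_dir"))
```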
@@ -2,7 +2,6 @@
train = null
dev = null
vectors = null
vocab_data = null
init_tok2vec = null

[system]

@@ -11,8 +10,13 @@ gpu_allocator = null

[nlp]
lang = null
# List of pipeline component names, in order. The names should correspond to
# components defined in the [components block]
pipeline = []
# Components that are loaded but disabled by default
disabled = []
# Optional callbacks to modify the nlp object before it's initialized, after
# it's created and after the pipeline has been set up
before_creation = null
after_creation = null
after_pipeline_creation = null

@@ -20,6 +24,7 @@ after_pipeline_creation = null
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

# The pipeline components and their models
[components]

# Readers for corpora like dev and train.

@@ -38,8 +43,7 @@ max_length = 0
limit = 0
# Apply some simply data augmentation, where we replace tokens with variations.
# This is especially useful for punctuation and case replacement, to help
# generalize beyond corpora that don't have smart-quotes, or only have smart
# quotes, etc.
# generalize beyond corpora that don't/only have smart quotes etc.
augmenter = null

[corpora.dev]

@@ -53,6 +57,7 @@ gold_preproc = false
max_length = 0
# Limitation on number of training examples
limit = 0
# Optional callback for data augmentation
augmenter = null

# Training hyper-parameters and additional features.

@@ -102,17 +107,18 @@ use_averages = false
eps = 1e-8
learn_rate = 0.001

# The 'initialize' step is run before training or pretraining. Components and
# the tokenizer can each define their own arguments via their .initialize
# methods that are populated by the config. This lets them gather resources like
# lookup tables and build label sets, construct vocabularies, etc.
# These settings are used when nlp.initialize() is called (typically before
# training or pretraining). Components and the tokenizer can each define their
# own arguments via their initialize methods that are populated by the config.
# This lets them gather data resources, build label sets etc.
[initialize]
vocab_data = ${paths.vocab_data}
lookups = null
vectors = ${paths.vectors}
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec}
# Data and lookups for vocabulary
vocab_data = null
lookups = null
# Arguments passed to the tokenizer's initialize method
tokenizer = {}
# Arguments passed to the initialize methods of the components (keyed by component name)
# Arguments for initialize methods of the components (keyed by component)
components = {}

@@ -710,6 +710,9 @@ class Errors:
"options: {modes}")
E1012 = ("Entity spans and blocked/missing/outside spans should be "
"provided to doc.set_ents as lists of `Span` objects.")
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
"token itself. To set the morph from this MorphAnalysis, set from "
"the string value with: `token.set_morph(str(other_morph))`.")

@add_codes

@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class DanishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES

@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class GermanDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

@@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .lemmatizer import GreekLemmatizer
from ...lookups import Lookups
from ...language import Language
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class GreekDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

@@ -4,21 +4,9 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class IndonesianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES

@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class LuxembourgishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS

@@ -3,21 +3,9 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class PortugueseDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES

@@ -7,21 +7,9 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ...language import Language
from ...lookups import Lookups
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class RussianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

@@ -2,21 +2,9 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class SerbianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

@@ -1,21 +1,9 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import load_config_from_str

DEFAULT_CONFIG = """
[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

class TamilDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

@@ -10,13 +10,6 @@ DEFAULT_CONFIG = """

[nlp.tokenizer]
@tokenizers = "spacy.th.ThaiTokenizer"

[initialize]

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""

spacy/ml/featureextractor.py (new file, 25 lines)

@@ -0,0 +1,25 @@
from typing import List, Union, Callable, Tuple
from thinc.types import Ints2d, Doc
from thinc.api import Model, registry

@registry.layers("spacy.FeatureExtractor.v1")
def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
    return Model("extract_features", forward, attrs={"columns": columns})

def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
    columns = model.attrs["columns"]
    features: List[Ints2d] = []
    for doc in docs:
        if hasattr(doc, "to_array"):
            attrs = doc.to_array(columns)
        else:
            attrs = doc.doc.to_array(columns)[doc.start : doc.end]
        if attrs.ndim == 1:
            attrs = attrs.reshape((attrs.shape[0], 1))
        features.append(model.ops.asarray2i(attrs, dtype="uint64"))

    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
    return features, backprop

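As a quick orientation for the new layer, the sketch below builds the extractor and runs it over a doc. This is only an illustrative usage example under the assumption that the file is importable as `spacy.ml.featureextractor`; the column names are arbitrary.

```python
from spacy.ml.featureextractor import FeatureExtractor
from spacy.tokens import Doc
from spacy.vocab import Vocab

extractor = FeatureExtractor(columns=["ORTH", "SHAPE"])
docs = [Doc(Vocab(), words=["a", "b"])]
# Thinc models return (output, backprop); here: one uint64 array of shape
# (n_tokens, n_columns) per doc, and a no-op backprop
features, backprop = extractor(docs, is_train=False)
```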
@@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor
from thinc.api import Relu, residual, expand_window

from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor

@registry.architectures.register("spacy.TextCatCNN.v1")

@@ -1,16 +1,16 @@
from typing import Optional, List, Union
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list
from thinc.api import FeatureExtractor, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from thinc.types import Floats2d
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM

from ...tokens import Doc
from ...util import registry
from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr

@registry.architectures.register("spacy.Tok2VecListener.v1")

@@ -98,7 +98,7 @@ def MultiHashEmbed(
attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representations.

The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
varying definitions depending on the Vocab of the Doc object passed in.
Vectors from pretrained static vectors can also be incorporated into the
concatenated representation.

@@ -115,7 +115,7 @@ def MultiHashEmbed(
also_use_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
seed = 7

def make_hash_embed(feature):

@@ -123,7 +123,7 @@ def MultiHashEmbed(
seed += 1
return HashEmbed(
width,
rows if feature == NORM else rows // 2,
rows if feature == LOWER else rows // 2,
column=cols.index(feature),
seed=seed,
dropout=0.0,

@@ -131,13 +131,13 @@ def MultiHashEmbed(
if also_embed_subwords:
embeddings = [
make_hash_embed(NORM),
make_hash_embed(LOWER),
make_hash_embed(PREFIX),
make_hash_embed(SUFFIX),
make_hash_embed(SHAPE),
]
else:
embeddings = [make_hash_embed(NORM)]
embeddings = [make_hash_embed(LOWER)]
concat_size = width * (len(embeddings) + also_use_static_vectors)
if also_use_static_vectors:
model = chain(

@@ -180,13 +180,17 @@ def CharacterEmbed(
of being in an arbitrary position depending on the word length.

The characters are embedded in a embedding table with a given number of rows,
and the vectors concatenated. A hash-embedded vector of the NORM of the word is
and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
also concatenated on, and the result is then passed through a feed-forward
network to construct a single vector to represent the information.

feature (int or str): An attribute to embed, to concatenate with the characters.
width (int): The width of the output vector and the feature embedding.
<<<<<<< HEAD
rows (int): The number of rows in the NORM hash embedding table.
=======
rows (int): The number of rows in the LOWER hash embedding table.
>>>>>>> 300e5a9928fd226dfddbf7d5c22558f696bfa1af
nM (int): The dimensionality of the character embeddings. Recommended values
are between 16 and 64.
nC (int): The number of UTF-8 bytes to embed per word. Recommended values

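For readers following the MultiHashEmbed changes above, the settings being touched (width, rows, subword embedding, static vectors) can be exercised roughly as below. This is a hedged sketch: the registered name and exact parameter defaults are assumptions based on the surrounding code, not confirmed by this diff.

```python
from spacy.util import registry

# Look up the architecture factory by its registered name (assumed here)
make_embed = registry.architectures.get("spacy.MultiHashEmbed.v1")
# Parameters mirror those referenced in the hunks above
embed = make_embed(
    width=96,
    rows=2000,
    also_embed_subwords=True,
    also_use_static_vectors=False,
)
```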
@@ -149,7 +149,7 @@ class Morphologizer(Tagger):
for example in get_examples():
for i, token in enumerate(example.reference):
pos = token.pos_
morph = token.morph_
morph = str(token.morph)
# create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph)
if pos:

@@ -167,7 +167,7 @@ class Morphologizer(Tagger):
gold_array = []
for i, token in enumerate(example.reference):
pos = token.pos_
morph = token.morph_
morph = str(token.morph)
morph_dict = Morphology.feats_to_dict(morph)
if pos:
morph_dict[self.POS_FEAT] = pos

@@ -268,6 +268,9 @@ class Tagger(Pipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects..
nlp (Language): The current nlp object the component is part of.
labels: The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.

DOCS: https://nightly.spacy.io/api/tagger#initialize
"""

@@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
labels: The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.

DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""

@@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
words = ["Eat", "blue", "ham"]
morph = ["Feat=V", "Feat=J", "Feat=N"]
doc = Doc(en_vocab, words=words, morphs=morph)
assert morph[0] == doc[0].morph_
assert morph[1] == doc[1].morph_
assert morph[2] == doc[2].morph_
assert morph[0] == str(doc[0].morph)
assert morph[1] == str(doc[1].morph)
assert morph[2] == str(doc[2].morph)

feats_array = doc.to_array((ORTH, MORPH))
assert feats_array[0][1] == doc[0].morph.key

@@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
words = ["I", "live", "in", "New", "York", "."]
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
# fmt: on
doc = Doc(en_vocab, words=words)
for i, morph in enumerate(morphs):
doc[i].morph_ = morph
doc = Doc(en_vocab, words=words, morphs=morphs)
attrs = [MORPH]
arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words)
new_doc.from_array(attrs, arr)
assert [t.morph_ for t in new_doc] == morphs
assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
assert [str(t.morph) for t in new_doc] == morphs
assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]

def test_doc_api_from_docs(en_tokenizer, de_tokenizer):

@@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
doc[0].tag_ = "A"
doc[0].pos_ = "X"
doc[0].morph_ = "Feat=Val"
doc[0].set_morph("Feat=Val")
doc[0].lemma_ = "a"
doc[0].dep_ = "dep"
doc[0].head = doc[1]

@@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
doc[1].tag_ = "A"
doc[1].pos_ = "X"
doc[1].morph_ = ""
doc[1].set_morph("")
doc[1].lemma_ = "a"
doc[1].dep_ = "dep"
doc.ents = [Span(doc, 0, 2, label="HELLO")]

@@ -533,5 +531,78 @@ def test_doc_ents_setter():
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
vocab = Vocab()
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
doc = Doc(vocab, words=words, ents=ents)
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]

def test_doc_morph_setter(en_tokenizer, de_tokenizer):
doc1 = en_tokenizer("a b")
doc1b = en_tokenizer("c d")
doc2 = de_tokenizer("a b")

# unset values can be copied
doc1[0].morph = doc1[1].morph
assert doc1[0].morph.key == 0
assert doc1[1].morph.key == 0

# morph values from the same vocab can be copied
doc1[0].set_morph("Feat=Val")
doc1[1].morph = doc1[0].morph
assert doc1[0].morph == doc1[1].morph

# ... also across docs
doc1b[0].morph = doc1[0].morph
assert doc1[0].morph == doc1b[0].morph

doc2[0].set_morph("Feat2=Val2")

# the morph value must come from the same vocab
with pytest.raises(ValueError):
doc1[0].morph = doc2[0].morph

def test_doc_init_iob():
"""Test ents validation/normalization in Doc.__init__"""
words = ["a", "b", "c", "d", "e"]
ents = ["O"] * len(words)
doc = Doc(Vocab(), words=words, ents=ents)
assert doc.ents == ()

ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2

ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 3

# None is missing
ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2

# empty tag is missing
ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
doc = Doc(Vocab(), words=words, ents=ents)
assert len(doc.ents) == 2

# invalid IOB
ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)

# no dash
ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)

# no ent type
ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)

# not strings or None
ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
with pytest.raises(ValueError):
doc = Doc(Vocab(), words=words, ents=ents)

@@ -4,13 +4,13 @@ import pytest
@pytest.fixture
def i_has(en_tokenizer):
doc = en_tokenizer("I has")
doc[0].morph_ = {"PronType": "prs"}
doc[1].morph_ = {
doc[0].set_morph({"PronType": "prs"})
doc[1].set_morph({
"VerbForm": "fin",
"Tense": "pres",
"Number": "sing",
"Person": "three",
}
})

return doc

@@ -47,20 +47,20 @@ def test_morph_get(i_has):
def test_morph_set(i_has):
assert i_has[0].morph.get("PronType") == ["prs"]
# set by string
i_has[0].morph_ = "PronType=unk"
i_has[0].set_morph("PronType=unk")
assert i_has[0].morph.get("PronType") == ["unk"]
# set by string, fields are alphabetized
i_has[0].morph_ = "PronType=123|NounType=unk"
assert i_has[0].morph_ == "NounType=unk|PronType=123"
i_has[0].set_morph("PronType=123|NounType=unk")
assert str(i_has[0].morph) == "NounType=unk|PronType=123"
# set by dict
i_has[0].morph_ = {"AType": "123", "BType": "unk"}
assert i_has[0].morph_ == "AType=123|BType=unk"
i_has[0].set_morph({"AType": "123", "BType": "unk"})
assert str(i_has[0].morph) == "AType=123|BType=unk"
# set by string with multiple values, fields and values are alphabetized
i_has[0].morph_ = "BType=c|AType=b,a"
assert i_has[0].morph_ == "AType=a,b|BType=c"
i_has[0].set_morph("BType=c|AType=b,a")
assert str(i_has[0].morph) == "AType=a,b|BType=c"
# set by dict with multiple values, fields and values are alphabetized
i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
assert i_has[0].morph_ == "AType=a,b|BType=c"
i_has[0].set_morph({"AType": "b,a", "BType": "c"})
assert str(i_has[0].morph) == "AType=a,b|BType=c"

def test_morph_str(i_has):

@@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
doc = tokenizer("a dog")

# set through token.morph_
doc[0].morph_ = "PronType=prs"
assert doc[0].morph_ == "PronType=prs"
doc[0].set_morph("PronType=prs")
assert str(doc[0].morph) == "PronType=prs"
assert doc.to_array(["MORPH"])[0] != 0

# unset with token.morph
doc[0].morph = 0
doc[0].set_morph(0)
assert doc.to_array(["MORPH"])[0] == 0

# empty morph is equivalent to "_"
doc[0].morph_ = ""
assert doc[0].morph_ == ""
doc[0].set_morph("")
assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]

# "_" morph is also equivalent to empty morph
doc[0].morph_ = "_"
assert doc[0].morph_ == ""
doc[0].set_morph("_")
assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]

# set through existing hash with token.morph
tokenizer.vocab.strings.add("Feat=Val")
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
assert doc[0].morph_ == "Feat=Val"
doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
assert str(doc[0].morph) == "Feat=Val"

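The behaviour these tests exercise boils down to a small user-facing pattern. A minimal standalone sketch, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I has")
# Dicts and FEATS strings are both accepted by set_morph()
doc[0].set_morph({"AType": "b,a", "BType": "c"})
# Fields and values come back alphabetized in the string form
assert str(doc[0].morph) == "AType=a,b|BType=c"
# An empty string (or "_") clears the analysis
doc[0].set_morph("")
assert str(doc[0].morph) == ""
```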
@@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
assert doc[4].morph_ == "Number=Plur"
assert str(doc[4].morph) == "Number=Plur"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"
assert doc[5].morph_ == "Number=Plur"
assert str(doc[5].morph) == "Number=Plur"

def test_doc_retokenize_merge_children(en_tokenizer):

@@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
ents = ["O"] * len(heads)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[10] = "B-GPE"
ents[13] = "B-PERSON"
ents[14] = "I-PERSON"
# fmt: on
tokens = en_tokenizer(text)
doc = Doc(

@@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# if there is a parse, span.root provides default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-fg"
ents[6] = "I-ent-fg"
deps = ["dep"] * len(words)
en_vocab.strings.add("ent-de")
en_vocab.strings.add("ent-fg")

@@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# check that B is preserved if span[start] is B
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
ents = ["O"] * len(words)
ents[3] = "B-ent-de"
ents[4] = "I-ent-de"
ents[5] = "B-ent-de"
ents[6] = "I-ent-de"
deps = ["dep"] * len(words)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
with doc.retokenize() as retokenizer:

@@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
assert doc[0].text == "Los"
assert doc[0].head.text == "Angeles"
assert doc[0].idx == 0
assert doc[0].morph_ == "Number=Sing"
assert str(doc[0].morph) == "Number=Sing"
assert doc[1].idx == 3
assert doc[1].text == "Angeles"
assert doc[1].head.text == "start"
assert doc[1].morph_ == "Number=Sing"
assert str(doc[1].morph) == "Number=Sing"
assert doc[2].text == "start"
assert doc[2].head.text == "."
assert doc[3].text == "."

@@ -9,7 +9,7 @@ def doc(en_vocab):
tags = ["VBP", "NN", "NN"]
heads = [0, 0, 0]
deps = ["ROOT", "dobj", "dobj"]
ents = [("ORG", 1, 2)]
ents = ["O", "B-ORG", "O"]
return Doc(
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
)

@@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val"
doc[0].set_morph("Feat=Val")
assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2"
doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 2

# IS_SUBSET acts like "IN" for attrs other than MORPH

@@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2"
doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 1

# IS_SUPERSET with more than one value only matches for MORPH

@@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0

doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
doc[0].set_morph("Feat2=Val2|Feat1=Val1")
assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
doc[0].set_morph("Feat1=Val1|Feat2=Val2")
assert len(matcher(doc)) == 2

# multiple values are split

@@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0

doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
assert len(matcher(doc)) == 2

@@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val"
doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP

@@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val"
doc2[0].set_morph("Feat=Val")
doc3 = Doc(en_vocab, words=["Test"])
matcher = PhraseMatcher(en_vocab, validate=True)
with pytest.warns(UserWarning):

@@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val"
doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP

@@ -339,7 +339,6 @@ def test_ner_warns_no_lookups(caplog):
nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner")
nlp.config["initialize"]["lookups"] = None
with caplog.at_level(logging.DEBUG):
nlp.initialize()
assert "W033" in caplog.text

@@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
a.add(**p)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")

@@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")

@@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")

@@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"

dev_examples = [
Example.from_dict(

@@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
for i in range(len(doc)):
if i == 4:
assert doc[i].pos_ == "PUNCT"
assert doc[i].morph_ == "PunctType=peri"
assert str(doc[i].morph) == "PunctType=peri"
else:
assert doc[i].pos_ == ""
assert doc[i].morph_ == ""
assert str(doc[i].morph) == ""

def test_attributeruler_morph_rules(nlp, morph_rules):

@@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
for i in range(len(doc)):
if i != 2:
assert doc[i].pos_ == ""
assert doc[i].morph_ == ""
assert str(doc[i].morph) == ""
else:
assert doc[2].pos_ == "DET"
assert doc[2].lemma_ == "a"
assert doc[2].morph_ == "Case=Nom"
assert str(doc[2].morph) == "Case=Nom"

def test_attributeruler_indices(nlp):

@@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
for i in range(len(doc)):
if i == 1:
assert doc[i].lemma_ == "was"
assert doc[i].morph_ == "Case=Nom|Number=Sing"
assert str(doc[i].morph) == "Case=Nom|Number=Sing"
elif i == 2:
assert doc[i].lemma_ == "the"
assert doc[i].morph_ == "Case=Nom|Number=Plur"
assert str(doc[i].morph) == "Case=Nom|Number=Plur"
elif i == 3:
assert doc[i].lemma_ == "cat"
else:
assert doc[i].morph_ == ""
assert str(doc[i].morph) == ""
# raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
with pytest.raises(ValueError):

@@ -91,7 +91,7 @@ def test_overfitting_IO():
doc = nlp(test_text)
gold_morphs = ["Feat=N", "Feat=V", "", ""]
gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
assert [t.morph_ for t in doc] == gold_morphs
assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags

# Also test the results are still the same after IO

@@ -99,5 +99,5 @@ def test_overfitting_IO():
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
assert [t.morph_ for t in doc2] == gold_morphs
assert [str(t.morph) for t in doc2] == gold_morphs
assert [t.pos_ for t in doc2] == gold_pos_tags

@@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
words = ["This", "is", "10", "%", "."]
tags = ["DT", "VBZ", "CD", "NN", "."]
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
ents = [("PERCENT", 2, 4)]
ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
assert doc.has_annotation("TAG")
expected = ("10", "NUM", "CD", "PERCENT")

@@ -76,7 +76,7 @@ def tagged_doc():
for i in range(len(tags)):
doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i]
doc[i].morph_ = morphs[i]
doc[i].set_morph(morphs[i])
if i > 0:
doc[i].is_sent_start = False
return doc

@@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
doc = Doc(
en_vocab,
words=input_.split(" "),
ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
ents=["B-CARDINAL", "O", "B-CARDINAL"],
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})

@@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
doc = Doc(
en_vocab,
words=input_.split(" "),
ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
)
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})

@@ -242,7 +242,7 @@ def test_tag_score(tagged_doc):
gold = {
"tags": [t.tag_ for t in tagged_doc],
"pos": [t.pos_ for t in tagged_doc],
"morphs": [t.morph_ for t in tagged_doc],
"morphs": [str(t.morph) for t in tagged_doc],
"sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
}
example = Example.from_dict(tagged_doc, gold)

@@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
tags[0] = "NN"
pos = [t.pos_ for t in tagged_doc]
pos[1] = "X"
morphs = [t.morph_ for t in tagged_doc]
morphs = [str(t.morph) for t in tagged_doc]
morphs[1] = "Number=sing"
morphs[2] = "Number=plur"
gold = {

@@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots):
predicted = Doc(vocab, words=annots["words"])
example = Example.from_dict(predicted, annots)
for i, token in enumerate(example.reference):
assert token.morph_ == annots["morphs"][i]
assert str(token.morph) == annots["morphs"][i]

@pytest.mark.parametrize(

@@ -30,7 +30,12 @@ def doc(en_vocab):
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
ents = ["O"] * len(words)
ents[0] = "B-PERSON"
ents[1] = "I-PERSON"
ents[5] = "B-LOC"
ents[6] = "I-LOC"
ents[8] = "B-GPE"
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
# fmt: on
doc = Doc(

@@ -455,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc):
idx = [t.idx for t in doc]
tags = [t.tag_ for t in doc]
pos = [t.pos_ for t in doc]
morphs = [t.morph_ for t in doc]
morphs = [str(t.morph) for t in doc]
lemmas = [t.lemma_ for t in doc]
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]

@@ -477,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc):
assert idx == [t.idx for t in reloaded_example.reference]
assert tags == [t.tag_ for t in reloaded_example.reference]
assert pos == [t.pos_ for t in reloaded_example.reference]
assert morphs == [t.morph_ for t in reloaded_example.reference]
assert morphs == [str(t.morph) for t in reloaded_example.reference]
assert lemmas == [t.lemma_ for t in reloaded_example.reference]
assert deps == [t.dep_ for t in reloaded_example.reference]
assert heads == [t.head.i for t in reloaded_example.reference]

@@ -101,7 +101,7 @@ class DocBin:
self.strings.add(token.text)
self.strings.add(token.tag_)
self.strings.add(token.lemma_)
self.strings.add(token.morph_)
self.strings.add(str(token.morph))
self.strings.add(token.dep_)
self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_)

@@ -213,8 +213,9 @@ cdef class Doc:
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
the same length as words, to assign as token.is_sent_start. Will be
overridden by heads if heads is provided. Defaults to None.
ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
(label, start, end) tuples to assign as doc.ents. Defaults to None.
ents (Optional[List[str]]): A list of unicode strings, of the same
length as words, as IOB tags to assign as token.ent_iob and
token.ent_type. Defaults to None.

DOCS: https://nightly.spacy.io/api/doc#init
"""

@@ -275,16 +276,55 @@ cdef class Doc:
sent_starts[i] = -1
elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
sent_starts[i] = 0
ent_iobs = None
ent_types = None
if ents is not None:
iob_strings = Token.iob_strings()
# make valid IOB2 out of IOB1 or IOB2
for i, ent in enumerate(ents):
if ent is "":
ents[i] = None
elif ent is not None and not isinstance(ent, str):
raise ValueError(Errors.E177.format(tag=ent))
if i < len(ents) - 1:
# OI -> OB
if (ent is None or ent.startswith("O")) and \
(ents[i+1] is not None and ents[i+1].startswith("I")):
ents[i+1] = "B" + ents[i+1][1:]
# B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
if ent is not None and ents[i+1] is not None and \
(ent.startswith("B") or ent.startswith("I")) and \
ents[i+1].startswith("I") and \
ent[1:] != ents[i+1][1:]:
ents[i+1] = "B" + ents[i+1][1:]
ent_iobs = []
ent_types = []
for ent in ents:
if ent is None:
ent_iobs.append(iob_strings.index(""))
ent_types.append("")
elif ent == "O":
ent_iobs.append(iob_strings.index(ent))
ent_types.append("")
else:
if len(ent) < 3 or ent[1] != "-":
raise ValueError(Errors.E177.format(tag=ent))
ent_iob, ent_type = ent.split("-", 1)
if ent_iob not in iob_strings:
raise ValueError(Errors.E177.format(tag=ent))
ent_iob = iob_strings.index(ent_iob)
ent_iobs.append(ent_iob)
ent_types.append(ent_type)
headings = []
values = []
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
for a, annot in enumerate(annotations):
if annot is not None:
if len(annot) != len(words):
raise ValueError(Errors.E189)
headings.append(possible_headings[a])
if annot is not heads and annot is not sent_starts:
if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
values.extend(annot)
for value in values:
self.vocab.strings.add(value)

@@ -296,7 +336,7 @@ cdef class Doc:
j = 0
for annot in annotations:
if annot:
if annot is heads or annot is sent_starts:
if annot is heads or annot is sent_starts or annot is ent_iobs:
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = annot[i]

@@ -317,8 +357,6 @@ cdef class Doc:
attrs[i, j] = self.vocab.strings[annot[i]]
j += 1
self.from_array(headings, attrs)
if ents is not None:
self.ents = ents

@property
def _(self):

@@ -1210,7 +1248,7 @@ cdef class Doc:
for token in self:
strings.add(token.tag_)
strings.add(token.lemma_)
strings.add(token.morph_)
strings.add(str(token.morph))
strings.add(token.dep_)
strings.add(token.ent_type_)
strings.add(token.ent_kb_id_)

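In user terms, the `Doc.__init__` change above swaps (label, start, end) tuples for per-token IOB strings. A rough illustration consistent with the docstring and tests in this commit (the words and labels are arbitrary):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["Apple", "opened", "an", "office", "in", "San", "Francisco"]
# One IOB tag per token, same length as words
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "I-GPE"]
doc = Doc(Vocab(), words=words, ents=ents)
assert [(e.label_, e.start, e.end) for e in doc.ents] == [("ORG", 0, 1), ("GPE", 5, 7)]
```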
@@ -215,20 +215,20 @@ cdef class Token:
def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)

def __set__(self, attr_t morph):
if morph == 0:
self.c.morph = morph
elif morph in self.vocab.strings:
self.morph_ = self.vocab.strings[morph]
else:
raise ValueError(Errors.E1009.format(val=morph))
def __set__(self, MorphAnalysis morph):
# Check that the morph has the same vocab
if self.vocab != morph.vocab:
raise ValueError(Errors.E1013)
self.c.morph = morph.c.key

property morph_:
def __get__(self):
return str(MorphAnalysis.from_id(self.vocab, self.c.morph))

def __set__(self, features):
cdef hash_t key = self.vocab.morphology.add(features)
def set_morph(self, features):
cdef hash_t key
if features is 0:
self.c.morph = 0
else:
if isinstance(features, int):
features = self.vocab.strings[features]
key = self.vocab.morphology.add(features)
self.c.morph = key

@property

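The Token API change above means string/dict values go through `set_morph()`, while direct assignment to `token.morph` now only accepts a `MorphAnalysis` from the same vocab (raising E1013 otherwise). A minimal sketch, assuming blank English and German pipelines:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("a b")
doc[0].set_morph("Feat=Val")   # strings, dicts or an existing hash are accepted
doc[1].morph = doc[0].morph    # a MorphAnalysis can be copied between tokens
assert str(doc[1].morph) == "Feat=Val"

other = spacy.blank("de")("a")
# Assigning a MorphAnalysis from a different vocab raises ValueError (E1013):
# other[0].morph = doc[0].morph
```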
@@ -207,6 +207,7 @@ def conllu_sentence_to_doc(
pos=poses,
deps=deps,
lemmas=lemmas,
morphs=morphs,
heads=heads,
)
for i in range(len(doc)):

@@ -46,7 +46,7 @@ def create_jsonl_reader(

@util.registry.readers("spacy.read_labels.v1")
def read_labels(path: Path, *, require: bool=False):
def read_labels(path: Path, *, require: bool = False):
# I decided not to give this a generic name, because I don't want people to
# use it for arbitrary stuff, as I want this require arg with default False.
if not require and not path.exists():

@@ -1,4 +1,4 @@
from collections import Iterable as IterableInstance
from collections.abc import Iterable as IterableInstance
import warnings
import numpy
from murmurhash.mrmr cimport hash64

@@ -226,7 +226,7 @@ cdef class Example:
"TAG": [t.tag_ for t in self.reference],
"LEMMA": [t.lemma_ for t in self.reference],
"POS": [t.pos_ for t in self.reference],
"MORPH": [t.morph_ for t in self.reference],
"MORPH": [str(t.morph) for t in self.reference],
"HEAD": [t.head.i for t in self.reference],
"DEP": [t.dep_ for t in self.reference],
"SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]

@@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
if include_annotation["POS"]:
json_token["pos"] = token.pos_
if include_annotation["MORPH"]:
json_token["morph"] = token.morph_
json_token["morph"] = str(token.morph)
if include_annotation["LEMMA"]:
json_token["lemma"] = token.lemma_
if include_annotation["DEP"]:

@@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build mixed representations. The features used are
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
static vectors can also be incorporated into the concatenated representation.
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
vectors can also be incorporated into the concatenated representation.

| Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |

@@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |

### spacy.FeatureExtractor.v1 {#FeatureExtractor}

> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.FeatureExtractor.v1"
> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
> ```

Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
of feature names to extract, which should refer to token attributes.

| Name | Description |
| ----------- | ------------------------------------------------------------------------ |
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |

## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}

The following architectures are provided by the package

@ -186,21 +186,53 @@ This functionality was previously available as part of the command `init-model`.
|
|||
</Infobox>
|
||||
|
||||
```cli
|
||||
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose]
|
||||
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
||||
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
|
||||
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||
| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
|
||||
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
|
||||
| Name | Description |
|
||||
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
|
||||
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
|
||||
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
|
||||
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
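As a quick check after conversion, you can load the exported pipeline and inspect the vectors it now contains. This is only a sketch; the output path is a placeholder for whatever `output_dir` you used.

```python
# Sketch: loading the pipeline written by `init vectors` and checking the
# converted vectors. The path is a placeholder.
import spacy

nlp = spacy.load("./output_dir")
print(nlp.vocab.vectors.shape)        # (number of vectors, vector width)
print(nlp.vocab["apple"].has_vector)  # True if "apple" was in the vectors file
```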
### init labels {#init-labels new="3" tag="command"}
|
||||
|
||||
Generate JSON files for the labels in the data. This helps speed up the training
|
||||
process, since spaCy won't have to preprocess the data to extract the labels.
|
||||
After generating the labels, you can provide them to components that accept a
|
||||
`labels` argument on initialization via the
|
||||
[`[initialize]`](/api/data-formats#config-initialize) block of your config.
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize.components.ner]
|
||||
>
|
||||
> [initialize.components.ner.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/ner.json
|
||||
> ```
|
||||
|
||||
```cli
|
||||
$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The label files, one JSON file per component. |
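The generated files are plain JSON, so you can also inspect them directly to double-check the extracted label set. This is a sketch; the path is a placeholder matching the example config above.

```python
# Sketch: inspecting a label file written by `init labels`. The path is a
# placeholder.
import srsly

ner_labels = srsly.read_json("corpus/labels/ner.json")
print(ner_labels)
```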
## convert {#convert tag="command"}
|
||||
|
||||
|
|
|
@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.
|
|||
> data_path = "/path/to/component_data"
|
||||
> ```
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ |
|
||||
|
@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)
|
|||
|
||||
## Lexical data for vocabulary {#vocab-jsonl new="2"}
|
||||
|
||||
To populate a pipeline's vocabulary, you can use the
|
||||
[`spacy init vectors`](/api/cli#init-vectors) command and load in a
|
||||
[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
|
||||
lexical entry per line via the `--lexemes-jsonl` option. The first line defines the
|
||||
language and vocabulary settings. All other lines are expected to be JSON
|
||||
objects describing an individual lexeme. The lexical attributes will then be set
|
||||
as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
|
||||
command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
|
||||
lexical data.
|
||||
This data file can be provided via the `vocab_data` setting in the
|
||||
`[initialize]` block of the training config to pre-define the lexical data to
|
||||
initialize the `nlp` object's vocabulary with. The file should contain one
|
||||
lexical entry per line. The first line defines the language and vocabulary
|
||||
settings. All other lines are expected to be JSON objects describing an
|
||||
individual lexeme. The lexical attributes will then be set as attributes on
|
||||
spaCy's [`Lexeme`](/api/lexeme#attributes) object.
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize]
|
||||
> vocab_data = "/path/to/vocab-data.jsonl"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### First line
|
||||
|
|
|
@ -21,8 +21,9 @@ non-projective parses.
|
|||
The parser is trained using an **imitation learning objective**. It follows the
|
||||
actions predicted by the current weights, and at each state, determines which
|
||||
actions are compatible with the optimal parse that could be reached from the
|
||||
current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
|
||||
that more than one action may be optimal for a given state.
|
||||
current state. The weights are updated such that the scores assigned to the set
|
||||
of optimal actions is increased, while scores assigned to other actions are
|
||||
decreased. Note that more than one action may be optimal for a given state.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
|
@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## DependencyParser.initialize {#initialize tag="method"}
|
||||
## DependencyParser.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
|
@ -162,12 +166,22 @@ This method was previously called `begin_training`.
|
|||
> parser = nlp.add_pipe("parser")
|
||||
> parser.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.parser]
|
||||
>
|
||||
> [initialize.components.parser.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/parser.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
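If you're initializing the component in Python rather than via the config, the same pre-generated file can be loaded and passed in as `labels`. This is a sketch; the path is a placeholder for a file created with [`init labels`](/api/cli#init-labels).

```python
# Sketch: passing pre-generated labels to initialize() directly in Python.
# The labels path is a placeholder.
import spacy
import srsly

nlp = spacy.blank("en")
parser = nlp.add_pipe("parser")
labels = srsly.read_json("corpus/labels/parser.json")
parser.initialize(lambda: [], nlp=nlp, labels=labels)
```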
## DependencyParser.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
|||
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
|
||||
| `ents` <Tag variant="new">3</Tag> | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
|
||||
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
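For example, a `Doc` with pre-set entities can be created from token-based IOB strings as described in the `ents` row above. This is a minimal sketch.

```python
# Sketch: constructing a Doc with token-based IOB entity annotations.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["I", "like", "New", "York", "."]
spaces = [True, True, True, False, False]
ents = ["O", "O", "B-GPE", "I-GPE", "O"]
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
print([(ent.text, ent.label_) for ent in doc.ents])  # [('New York', 'GPE')]
```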
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
||||
|
@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.
|
|||
Mark a span for merging. The `attrs` will be applied to the resulting token (if
|
||||
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
|
||||
underlying lexeme (if they're context-independent lexical attributes like
|
||||
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
|
||||
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
|
||||
using the `"_"` key and specifying a dictionary that maps attribute names to
|
||||
values.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
|
@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## EntityLinker.initialize {#initialize tag="method"}
|
||||
## EntityLinker.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
|
|
@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters.
|
|||
|
||||
| Setting | Description |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
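These settings can also be overridden when the component is added in Python instead of via the config file. A minimal sketch:

```python
# Sketch: overriding a default setting when adding the component in Python.
import spacy

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", config={"update_with_oracle_cut_size": 100})
```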
@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## EntityRecognizer.initialize {#initialize tag="method"}
|
||||
## EntityRecognizer.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
|
@ -152,12 +155,22 @@ This method was previously called `begin_training`.
|
|||
> ner = nlp.add_pipe("ner")
|
||||
> ner.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.ner]
|
||||
>
|
||||
> [initialize.components.ner.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/ner.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
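If no label file is available, the labels can instead be extracted from a handful of gold-standard examples. This is a sketch using a single made-up example.

```python
# Sketch: initializing from Example objects so the labels are extracted from
# the data instead of a pre-generated file.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
doc = nlp.make_doc("I like Berlin")
example = Example.from_dict(doc, {"entities": ["O", "O", "B-GPE"]})
ner.initialize(lambda: [example], nlp=nlp)
print(ner.labels)  # ('GPE',)
```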
## EntityRecognizer.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.
|
|||
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
|
||||
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
|
||||
|
||||
## Language.initialize {#initialize tag="method"}
|
||||
## Language.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the pipeline for training and return an
|
||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
|
||||
|
|
|
@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).
|
|||
> morphologizer = nlp.add_pipe("morphologizer")
|
||||
> morphologizer.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.morphologizer]
|
||||
>
|
||||
> [initialize.components.morphologizer.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/morphologizer.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
|
||||
|
||||
## Morphologizer.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Pipe.initialize {#initialize tag="method"}
|
||||
## Pipe.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
|
|
@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Tagger.initialize {#initialize tag="method"}
|
||||
## Tagger.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
|
@ -135,12 +138,22 @@ This method was previously called `begin_training`.
|
|||
> tagger = nlp.add_pipe("tagger")
|
||||
> tagger.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.tagger]
|
||||
>
|
||||
> [initialize.components.tagger.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/tagger.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |
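Since the tagger expects its labels as a list, the tag set can also be passed inline. This is a sketch; the tags are placeholders.

```python
# Sketch: providing the tag set inline as a list, matching the type noted in
# the table above. The tags are placeholders.
import spacy

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
tagger.initialize(lambda: [], nlp=nlp, labels=["NN", "VB", "JJ"])
print(tagger.labels)
```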
## Tagger.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## TextCategorizer.initialize {#initialize tag="method"}
|
||||
## TextCategorizer.initialize {#initialize tag="method" new="3"}
|
||||
|
||||
Initialize the component for training. `get_examples` should be a function that
|
||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||
|
@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
|
|||
network,
|
||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||
setting up the label scheme based on the data. This method is typically called
|
||||
by [`Language.initialize`](/api/language#initialize).
|
||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||
arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||
|
||||
|
@ -148,12 +151,22 @@ This method was previously called `begin_training`.
|
|||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat.initialize(lambda: [], nlp=nlp)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.textcat]
|
||||
>
|
||||
> [initialize.components.textcat.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/textcat.json
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
|
||||
|
||||
## TextCategorizer.predict {#predict tag="method"}
|
||||
|
||||
|
|
|
@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
|
|||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||
| **CREATES** | The corpus reader. ~~JsonlTexts~~ |
|
||||
|
||||
### spacy.read_labels.v1 {#read_labels tag="registered function"}
|
||||
|
||||
Read a JSON-formatted labels file generated with
|
||||
[`init labels`](/api/cli#init-labels). Typically used in the
|
||||
[`[initialize]`](/api/data-formats#config-initialize) block of the training
|
||||
config to speed up the model initialization process and provide pre-generated
|
||||
label sets.
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize.components]
|
||||
>
|
||||
> [initialize.components.ner]
|
||||
>
|
||||
> [initialize.components.ner.labels]
|
||||
> @readers = "spacy.read_labels.v1"
|
||||
> path = "corpus/labels/ner.json"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
|
||||
| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
|
||||
| **CREATES** | The labels read from the file, or `None` if the file doesn't exist and `require` is set to `False`. |
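The reader can also be resolved and called from Python if you want to load a label file outside the config system. This is a sketch; the path is a placeholder, and it assumes the reader has been registered by importing spaCy.

```python
# Sketch: resolving the registered reader and loading a label file in Python.
# The path is a placeholder; generate the file with `init labels` first.
import spacy  # ensures spaCy's registered functions are available
from spacy.util import registry

read_labels = registry.readers.get("spacy.read_labels.v1")
labels = read_labels(path="corpus/labels/ner.json")
print(labels)
```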
## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
|
||||
|
||||
A data batcher implements a batching strategy that essentially turns a stream of
|
||||
|
|
|
@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
|
|||
embeddings.
|
||||
|
||||
```python
|
||||
from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor
|
||||
from thinc.api import add, chain, remap_ids, Embed
|
||||
from spacy.ml.staticvectors import StaticVectors
|
||||
from spacy.ml.featureextractor import FeatureExtractor
|
||||
from spacy.util import registry
|
||||
|
||||
@registry.architectures("my_example.MyEmbedding.v1")
|
||||
|
|
|
@ -204,7 +204,19 @@ initialize it.
|
|||
|
||||
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
|
||||
|
||||
<!-- TODO: explain lifecycle and initialization -->
|
||||
At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
|
||||
config and load all data, including tokenization rules, model weights and other
|
||||
resources from the pipeline directory. The `[training]` block contains the
|
||||
settings for training the model and is only used during training. Similarly, the
|
||||
`[initialize]` block defines how the initial `nlp` object should be set up
|
||||
before training and whether it should be initialized with vectors or pretrained
|
||||
tok2vec weights, or any other data needed by the components.
|
||||
|
||||
The initialization settings are only loaded and used when
|
||||
[`nlp.initialize`](/api/language#initialize) is called (typically right before
|
||||
training). This allows you to set up your pipeline using local data resources
|
||||
and custom functions, and preserve the information in your config – but without
|
||||
requiring it to be available at runtime.
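To make the lifecycle concrete: once training has produced a pipeline directory, loading it back only requires the `[nlp]` and `[components]` settings plus the saved data, not the `[initialize]` resources. A minimal sketch with a placeholder path:

```python
# Sketch: at runtime only [nlp] and [components] plus the saved data are used,
# so the [initialize] resources don't need to be available. Placeholder path.
import spacy

nlp = spacy.load("./training/model-best")
doc = nlp("The pipeline runs without the original initialization data.")
```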
### Overwriting config settings on the command line {#config-overrides}
|
||||
|
||||
|
@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
|||
return create_model(output_width)
|
||||
```
|
||||
|
||||
<!-- TODO:
|
||||
### Customizing the initialization {#initialization}
|
||||
-->
|
||||
|
||||
## Data utilities {#data}
|
||||
|
||||
spaCy includes various features and utilities to make it easy to train models
|
||||
|
@ -853,7 +869,7 @@ nlp = spacy.blank("en")
|
|||
docbin = DocBin(nlp.vocab)
|
||||
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
|
||||
spaces = [True, True, True, True, True, True, True, False]
|
||||
ents = [("ORG", 0, 1), ("GPE", 5, 6)]
|
||||
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
|
||||
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
|
||||
docbin.add(doc)
|
||||
docbin.to_disk("./train.spacy")
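# As a quick sanity check (a sketch), you can read the DocBin back in and
# look at the stored entities before training on the file.
reloaded = DocBin().from_disk("./train.spacy")
docs = list(reloaded.get_docs(nlp.vocab))
print([(ent.text, ent.label_) for ent in docs[0].ents])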
@ -104,7 +104,6 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
|
|||
>
|
||||
> ```ini
|
||||
> [training]
|
||||
> vectors = null
|
||||
> accumulate_gradient = 3
|
||||
>
|
||||
> [training.optimizer]
|
||||
|
@ -430,6 +429,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
|
|||
| [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
|
||||
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
|
||||
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
|
||||
| [`init vectors`](/api/cli#init-vectors) | Convert word vectors for use with spaCy. |
|
||||
| [`init labels`](/api/cli#init-labels) | Generate JSON files for the labels in the data to speed up training. |
|
||||
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
|
||||
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
|
||||
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
const autoprefixer = require('autoprefixer')
|
||||
const path = require('path')
|
||||
|
||||
// https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/
|
||||
const sharp = require('sharp')
|
||||
sharp.cache(false)
|
||||
sharp.simd(false)
|
||||
|
||||
// Markdown plugins
|
||||
const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js')
|
||||
const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js')