Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 18:26:30 +03:00)

Commit 75a1569908 ("Merge")
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a26"
+__version__ = "3.0.0a28"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -7,6 +7,7 @@ import srsly

 from .. import util
 from ..training.initialize import init_nlp, convert_vectors
+from ..language import Language
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu

@@ -19,9 +20,9 @@ def init_vectors_cli(
     output_dir: Path = Arg(..., help="Pipeline output directory"),
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that
@@ -32,12 +33,7 @@ def init_vectors_cli(
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
+        update_lexemes(nlp, jsonl_loc)
     convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
@@ -48,6 +44,16 @@ def init_vectors_cli(
     )


+def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
+    # Mostly used for backwards-compatibility and may be removed in the future
+    lex_attrs = srsly.read_jsonl(jsonl_loc)
+    for attrs in lex_attrs:
+        if "settings" in attrs:
+            continue
+        lexeme = nlp.vocab[attrs["orth"]]
+        lexeme.set_attrs(**attrs)
+
+
 @init_cli.command(
     "nlp",
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
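For reference, a minimal sketch of the lexeme-attribute handling that update_lexemes() performs. The command itself is invoked as something like `python -m spacy init vectors en ./vectors.txt ./output` with the options above; the attribute names and values below are illustrative, not taken from this diff.

    import spacy

    nlp = spacy.blank("en")
    # Records in the same shape the hidden --lexemes-jsonl file provides
    lex_attrs = [{"orth": "spaCy", "norm": "spacy"}, {"settings": {}}]
    for attrs in lex_attrs:
        if "settings" in attrs:  # per-file settings records are skipped
            continue
        lexeme = nlp.vocab[attrs["orth"]]
        lexeme.set_attrs(**attrs)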
@@ -89,7 +95,7 @@ def init_labels_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    """Generate a JSON file for labels in the data. This helps speed up the
+    """Generate JSON files for the labels in the data. This helps speed up the
     training process, since spaCy won't have to preprocess the data to
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
@@ -2,7 +2,6 @@
 train = null
 dev = null
 vectors = null
-vocab_data = null
 init_tok2vec = null

 [system]
@@ -11,8 +10,13 @@ gpu_allocator = null

 [nlp]
 lang = null
+# List of pipeline component names, in order. The names should correspond to
+# components defined in the [components] block
 pipeline = []
+# Components that are loaded but disabled by default
 disabled = []
+# Optional callbacks to modify the nlp object before it's initialized, after
+# it's created and after the pipeline has been set up
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -20,6 +24,7 @@ after_pipeline_creation = null
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"

+# The pipeline components and their models
 [components]

 # Readers for corpora like dev and train.
@@ -38,8 +43,7 @@ max_length = 0
 limit = 0
 # Apply some simple data augmentation, where we replace tokens with variations.
 # This is especially useful for punctuation and case replacement, to help
-# generalize beyond corpora that don't have smart-quotes, or only have smart
-# quotes, etc.
+# generalize beyond corpora that don't/only have smart quotes etc.
 augmenter = null

 [corpora.dev]
@@ -53,6 +57,7 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Optional callback for data augmentation
 augmenter = null

 # Training hyper-parameters and additional features.
@@ -102,17 +107,18 @@ use_averages = false
 eps = 1e-8
 learn_rate = 0.001

-# The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own arguments via their .initialize
-# methods that are populated by the config. This lets them gather resources like
-# lookup tables and build label sets, construct vocabularies, etc.
+# These settings are used when nlp.initialize() is called (typically before
+# training or pretraining). Components and the tokenizer can each define their
+# own arguments via their initialize methods that are populated by the config.
+# This lets them gather data resources, build label sets etc.
 [initialize]
-vocab_data = ${paths.vocab_data}
-lookups = null
 vectors = ${paths.vectors}
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
+# Data and lookups for vocabulary
+vocab_data = null
+lookups = null
 # Arguments passed to the tokenizer's initialize method
 tokenizer = {}
-# Arguments passed to the initialize methods of the components (keyed by component name)
+# Arguments for initialize methods of the components (keyed by component)
 components = {}
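A small sketch of how these settings are consumed at runtime. The resolved [initialize] block of nlp.config is what nlp.initialize() reads; individual entries can be overridden in code before the call (the "ner" pipe here is illustrative).

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("ner")
    # e.g. explicitly disabling the lookups loader, as an older test in this diff did
    nlp.config["initialize"]["lookups"] = None
    nlp.initialize()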
@@ -710,6 +710,9 @@ class Errors:
              "options: {modes}")
     E1012 = ("Entity spans and blocked/missing/outside spans should be "
              "provided to doc.set_ents as lists of `Span` objects.")
+    E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
+             "token itself. To set the morph from this MorphAnalysis, set from "
+             "the string value with: `token.set_morph(str(other_morph))`.")


 @add_codes
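E1013 backs the stricter morph setter introduced in this commit. A minimal sketch of the behaviour it guards against, going through the string form as the error message suggests (the texts and features are illustrative):

    import spacy

    nlp_en = spacy.blank("en")
    nlp_de = spacy.blank("de")
    doc_en = nlp_en("a b")
    doc_de = nlp_de("a b")
    doc_de[0].set_morph("Case=Nom")
    # Assigning a MorphAnalysis from another vocab directly raises E1013, so
    # convert through the string value instead:
    doc_en[0].set_morph(str(doc_de[0].morph))
    assert str(doc_en[0].morph) == "Case=Nom"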
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class DanishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class GermanDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class GreekDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -4,21 +4,9 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class IndonesianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class LuxembourgishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
@@ -3,21 +3,9 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class PortugueseDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
@@ -7,21 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class RussianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -2,21 +2,9 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class SerbianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
@@ -1,21 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""


 class TamilDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

@@ -10,13 +10,6 @@ DEFAULT_CONFIG = """

 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
-
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
 """

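All of these language defaults drop their inline [initialize.lookups] block. A pipeline that still wants the lexeme_norm table can declare the same registered loader in its own config; a minimal sketch, with the hard-coded "da" standing in for the usual ${nlp.lang} variable:

    from spacy.util import load_config_from_str

    CONFIG = """
    [initialize]

    [initialize.lookups]
    @misc = "spacy.LookupsDataLoader.v1"
    lang = "da"
    tables = ["lexeme_norm"]
    """
    config = load_config_from_str(CONFIG)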
spacy/ml/featureextractor.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+from typing import List, Union, Callable, Tuple
+from thinc.types import Ints2d, Doc
+from thinc.api import Model, registry
+
+
+@registry.layers("spacy.FeatureExtractor.v1")
+def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
+    return Model("extract_features", forward, attrs={"columns": columns})
+
+
+def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
+    columns = model.attrs["columns"]
+    features: List[Ints2d] = []
+    for doc in docs:
+        if hasattr(doc, "to_array"):
+            attrs = doc.to_array(columns)
+        else:
+            attrs = doc.doc.to_array(columns)[doc.start : doc.end]
+        if attrs.ndim == 1:
+            attrs = attrs.reshape((attrs.shape[0], 1))
+        features.append(model.ops.asarray2i(attrs, dtype="uint64"))
+
+    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
+    return features, backprop
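A small usage sketch of the relocated layer; the column choice and texts are illustrative:

    import spacy
    from spacy.attrs import ORTH, SHAPE
    from spacy.ml.featureextractor import FeatureExtractor

    nlp = spacy.blank("en")
    docs = [nlp("hello world"), nlp("spaCy")]
    extractor = FeatureExtractor([ORTH, SHAPE])
    # one uint64 array of shape (n_tokens, n_columns) per Doc
    arrays = extractor.predict(docs)
    print([a.shape for a in arrays])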
@@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
 from thinc.api import HashEmbed, with_array, with_cpu, uniqued
-from thinc.api import Relu, residual, expand_window, FeatureExtractor
+from thinc.api import Relu, residual, expand_window

 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor


 @registry.architectures.register("spacy.TextCatCNN.v1")
@@ -1,16 +1,16 @@
 from typing import Optional, List, Union
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list
-from thinc.api import FeatureExtractor, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from thinc.types import Floats2d
+from thinc.api import chain, clone, concatenate, with_array, with_padded
+from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
+from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM

 from ...tokens import Doc
 from ...util import registry
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr


 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -98,7 +98,7 @@ def MultiHashEmbed(
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representation.

-    The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
+    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
     varying definitions depending on the Vocab of the Doc object passed in.
     Vectors from pretrained static vectors can also be incorporated into the
     concatenated representation.
@@ -115,7 +115,7 @@ def MultiHashEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
     seed = 7

     def make_hash_embed(feature):
@@ -123,7 +123,7 @@ def MultiHashEmbed(
         seed += 1
         return HashEmbed(
             width,
-            rows if feature == NORM else rows // 2,
+            rows if feature == LOWER else rows // 2,
             column=cols.index(feature),
             seed=seed,
             dropout=0.0,
@@ -131,13 +131,13 @@ def MultiHashEmbed(

     if also_embed_subwords:
         embeddings = [
-            make_hash_embed(NORM),
+            make_hash_embed(LOWER),
             make_hash_embed(PREFIX),
             make_hash_embed(SUFFIX),
             make_hash_embed(SHAPE),
         ]
     else:
-        embeddings = [make_hash_embed(NORM)]
+        embeddings = [make_hash_embed(LOWER)]
     concat_size = width * (len(embeddings) + also_use_static_vectors)
     if also_use_static_vectors:
         model = chain(
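The embedded word feature changes from NORM to LOWER throughout. A quick illustration of the difference between the two lexeme attributes (example text is illustrative):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("I'm visiting NYC")
    # LOWER is simply the lowercased orth; NORM can additionally reflect language
    # data, e.g. English contractions like "'m" may normalize to "am"
    print([t.lower_ for t in doc])
    print([t.norm_ for t in doc])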
@@ -180,13 +180,13 @@ def CharacterEmbed(
     of being in an arbitrary position depending on the word length.

     The characters are embedded in an embedding table with a given number of rows,
-    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
+    and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.

     feature (int or str): An attribute to embed, to concatenate with the characters.
     width (int): The width of the output vector and the feature embedding.
-    rows (int): The number of rows in the NORM hash embedding table.
+    rows (int): The number of rows in the LOWER hash embedding table.
     nM (int): The dimensionality of the character embeddings. Recommended values
     are between 16 and 64.
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
@@ -149,7 +149,7 @@ class Morphologizer(Tagger):
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                 # create and add the combined morph+POS label
                 morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
@@ -167,7 +167,7 @@ class Morphologizer(Tagger):
             gold_array = []
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                 morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
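The same API change runs through the pipeline code and the tests below: the Token.morph_ string accessor is replaced by str(Token.morph) for reading and Token.set_morph() for writing. In user code it looks roughly like this (the feature value is illustrative):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("I has")
    doc[0].set_morph("PronType=Prs")            # was: doc[0].morph_ = "PronType=Prs"
    assert str(doc[0].morph) == "PronType=Prs"  # was: doc[0].morph_
    assert doc[0].morph.get("PronType") == ["Prs"]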
@@ -268,6 +268,9 @@ class Tagger(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.

         DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
@@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.

         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
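A rough sketch of passing precomputed labels to a component's initialize method. In a training run the labels usually come from `python -m spacy init labels` and are wired in via the config; the tags and example text here are illustrative:

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    tagger = nlp.add_pipe("tagger")

    def get_examples():
        doc = nlp.make_doc("I like cats")
        return [Example.from_dict(doc, {"tags": ["PRP", "VBP", "NNS"]})]

    # with labels given up front, initialize doesn't need to scan the data for them
    tagger.initialize(get_examples, nlp=nlp, labels=["PRP", "VBP", "NNS"])
    print(tagger.labels)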
@@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
     words = ["Eat", "blue", "ham"]
     morph = ["Feat=V", "Feat=J", "Feat=N"]
     doc = Doc(en_vocab, words=words, morphs=morph)
-    assert morph[0] == doc[0].morph_
-    assert morph[1] == doc[1].morph_
-    assert morph[2] == doc[2].morph_
+    assert morph[0] == str(doc[0].morph)
+    assert morph[1] == str(doc[1].morph)
+    assert morph[2] == str(doc[2].morph)

     feats_array = doc.to_array((ORTH, MORPH))
     assert feats_array[0][1] == doc[0].morph.key
@@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
     words = ["I", "live", "in", "New", "York", "."]
     morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
     # fmt: on
-    doc = Doc(en_vocab, words=words)
-    for i, morph in enumerate(morphs):
-        doc[i].morph_ = morph
+    doc = Doc(en_vocab, words=words, morphs=morphs)
     attrs = [MORPH]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
     new_doc.from_array(attrs, arr)
-    assert [t.morph_ for t in new_doc] == morphs
-    assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
+    assert [str(t.morph) for t in new_doc] == morphs
+    assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]


 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
@@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):

     doc[0].tag_ = "A"
     doc[0].pos_ = "X"
-    doc[0].morph_ = "Feat=Val"
+    doc[0].set_morph("Feat=Val")
     doc[0].lemma_ = "a"
     doc[0].dep_ = "dep"
     doc[0].head = doc[1]
@@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):

     doc[1].tag_ = "A"
     doc[1].pos_ = "X"
-    doc[1].morph_ = ""
+    doc[1].set_morph("")
     doc[1].lemma_ = "a"
     doc[1].dep_ = "dep"
     doc.ents = [Span(doc, 0, 2, label="HELLO")]
@@ -533,5 +531,78 @@ def test_doc_ents_setter():
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
     vocab = Vocab()
     ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
     doc = Doc(vocab, words=words, ents=ents)
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+
+
+def test_doc_morph_setter(en_tokenizer, de_tokenizer):
+    doc1 = en_tokenizer("a b")
+    doc1b = en_tokenizer("c d")
+    doc2 = de_tokenizer("a b")
+
+    # unset values can be copied
+    doc1[0].morph = doc1[1].morph
+    assert doc1[0].morph.key == 0
+    assert doc1[1].morph.key == 0
+
+    # morph values from the same vocab can be copied
+    doc1[0].set_morph("Feat=Val")
+    doc1[1].morph = doc1[0].morph
+    assert doc1[0].morph == doc1[1].morph
+
+    # ... also across docs
+    doc1b[0].morph = doc1[0].morph
+    assert doc1[0].morph == doc1b[0].morph
+
+    doc2[0].set_morph("Feat2=Val2")
+
+    # the morph value must come from the same vocab
+    with pytest.raises(ValueError):
+        doc1[0].morph = doc2[0].morph
+
+
+def test_doc_init_iob():
+    """Test ents validation/normalization in Doc.__init__"""
+    words = ["a", "b", "c", "d", "e"]
+    ents = ["O"] * len(words)
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert doc.ents == ()
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 3
+
+    # None is missing
+    ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # empty tag is missing
+    ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # invalid IOB
+    ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # no dash
+    ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # no ent type
+    ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # not strings or None
+    ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
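For reference, the constructor path exercised by test_doc_init_iob: entities passed per token as IOB strings rather than as (label, start, end) tuples. The labels and words here are illustrative:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    words = ["San", "Francisco", "is", "foggy"]
    ents = ["B-GPE", "I-GPE", "O", "O"]
    doc = Doc(Vocab(), words=words, ents=ents)
    print([(e.text, e.label_) for e in doc.ents])  # [('San Francisco', 'GPE')]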
@@ -4,13 +4,13 @@ import pytest
 @pytest.fixture
 def i_has(en_tokenizer):
     doc = en_tokenizer("I has")
-    doc[0].morph_ = {"PronType": "prs"}
-    doc[1].morph_ = {
+    doc[0].set_morph({"PronType": "prs"})
+    doc[1].set_morph({
         "VerbForm": "fin",
         "Tense": "pres",
         "Number": "sing",
         "Person": "three",
-    }
+    })

     return doc

@@ -47,20 +47,20 @@ def test_morph_get(i_has):
 def test_morph_set(i_has):
     assert i_has[0].morph.get("PronType") == ["prs"]
     # set by string
-    i_has[0].morph_ = "PronType=unk"
+    i_has[0].set_morph("PronType=unk")
     assert i_has[0].morph.get("PronType") == ["unk"]
     # set by string, fields are alphabetized
-    i_has[0].morph_ = "PronType=123|NounType=unk"
-    assert i_has[0].morph_ == "NounType=unk|PronType=123"
+    i_has[0].set_morph("PronType=123|NounType=unk")
+    assert str(i_has[0].morph) == "NounType=unk|PronType=123"
     # set by dict
-    i_has[0].morph_ = {"AType": "123", "BType": "unk"}
-    assert i_has[0].morph_ == "AType=123|BType=unk"
+    i_has[0].set_morph({"AType": "123", "BType": "unk"})
+    assert str(i_has[0].morph) == "AType=123|BType=unk"
     # set by string with multiple values, fields and values are alphabetized
-    i_has[0].morph_ = "BType=c|AType=b,a"
-    assert i_has[0].morph_ == "AType=a,b|BType=c"
+    i_has[0].set_morph("BType=c|AType=b,a")
+    assert str(i_has[0].morph) == "AType=a,b|BType=c"
     # set by dict with multiple values, fields and values are alphabetized
-    i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
-    assert i_has[0].morph_ == "AType=a,b|BType=c"
+    i_has[0].set_morph({"AType": "b,a", "BType": "c"})
+    assert str(i_has[0].morph) == "AType=a,b|BType=c"


 def test_morph_str(i_has):
@@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
     doc = tokenizer("a dog")

     # set through token.morph_
-    doc[0].morph_ = "PronType=prs"
-    assert doc[0].morph_ == "PronType=prs"
+    doc[0].set_morph("PronType=prs")
+    assert str(doc[0].morph) == "PronType=prs"
     assert doc.to_array(["MORPH"])[0] != 0

     # unset with token.morph
-    doc[0].morph = 0
+    doc[0].set_morph(0)
     assert doc.to_array(["MORPH"])[0] == 0

     # empty morph is equivalent to "_"
-    doc[0].morph_ = ""
-    assert doc[0].morph_ == ""
+    doc[0].set_morph("")
+    assert str(doc[0].morph) == ""
     assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]

     # "_" morph is also equivalent to empty morph
-    doc[0].morph_ = "_"
-    assert doc[0].morph_ == ""
+    doc[0].set_morph("_")
+    assert str(doc[0].morph) == ""
     assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]

     # set through existing hash with token.morph
     tokenizer.vocab.strings.add("Feat=Val")
-    doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
-    assert doc[0].morph_ == "Feat=Val"
+    doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
+    assert str(doc[0].morph) == "Feat=Val"
@@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
-    assert doc[4].morph_ == "Number=Plur"
+    assert str(doc[4].morph) == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
-    assert doc[5].morph_ == "Number=Plur"
+    assert str(doc[5].morph) == "Number=Plur"


 def test_doc_retokenize_merge_children(en_tokenizer):
@@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
     ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
+    ents = ["O"] * len(heads)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[10] = "B-GPE"
+    ents[13] = "B-PERSON"
+    ents[14] = "I-PERSON"
     # fmt: on
     tokens = en_tokenizer(text)
     doc = Doc(
@@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # if there is a parse, span.root provides default values
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-fg"
+    ents[6] = "I-ent-fg"
     deps = ["dep"] * len(words)
     en_vocab.strings.add("ent-de")
     en_vocab.strings.add("ent-fg")
@@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # check that B is preserved if span[start] is B
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-de"
+    ents[6] = "I-ent-de"
     deps = ["dep"] * len(words)
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
     with doc.retokenize() as retokenizer:
@@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
     assert doc[0].text == "Los"
     assert doc[0].head.text == "Angeles"
     assert doc[0].idx == 0
-    assert doc[0].morph_ == "Number=Sing"
+    assert str(doc[0].morph) == "Number=Sing"
     assert doc[1].idx == 3
     assert doc[1].text == "Angeles"
     assert doc[1].head.text == "start"
-    assert doc[1].morph_ == "Number=Sing"
+    assert str(doc[1].morph) == "Number=Sing"
     assert doc[2].text == "start"
     assert doc[2].head.text == "."
     assert doc[3].text == "."
@@ -9,7 +9,7 @@ def doc(en_vocab):
     tags = ["VBP", "NN", "NN"]
     heads = [0, 0, 0]
     deps = ["ROOT", "dobj", "dobj"]
-    ents = [("ORG", 1, 2)]
+    ents = ["O", "B-ORG", "O"]
     return Doc(
         en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
     )
@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
|
||||||
matcher.add("M", [pattern])
|
matcher.add("M", [pattern])
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc)) == 3
|
assert len(matcher(doc)) == 3
|
||||||
doc[0].morph_ = "Feat=Val"
|
doc[0].set_morph("Feat=Val")
|
||||||
assert len(matcher(doc)) == 3
|
assert len(matcher(doc)) == 3
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2"
|
doc[0].set_morph("Feat=Val|Feat2=Val2")
|
||||||
assert len(matcher(doc)) == 3
|
assert len(matcher(doc)) == 3
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
# IS_SUBSET acts like "IN" for attrs other than MORPH
|
# IS_SUBSET acts like "IN" for attrs other than MORPH
|
||||||
|
@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
|
||||||
matcher.add("M", [pattern])
|
matcher.add("M", [pattern])
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2"
|
doc[0].set_morph("Feat=Val|Feat2=Val2")
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
# IS_SUPERSET with more than one value only matches for MORPH
|
# IS_SUPERSET with more than one value only matches for MORPH
|
||||||
|
@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
|
doc[0].set_morph("Feat2=Val2|Feat1=Val1")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
|
doc[0].set_morph("Feat1=Val1|Feat2=Val2")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
# multiple values are split
|
# multiple values are split
|
||||||
|
@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
|
doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
|
doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
doc2 = Doc(en_vocab, words=["Test"])
|
doc2 = Doc(en_vocab, words=["Test"])
|
||||||
doc2[0].tag_ = "TAG"
|
doc2[0].tag_ = "TAG"
|
||||||
doc2[0].pos_ = "X"
|
doc2[0].pos_ = "X"
|
||||||
doc2[0].morph_ = "Feat=Val"
|
doc2[0].set_morph("Feat=Val")
|
||||||
doc2[0].lemma_ = "LEMMA"
|
doc2[0].lemma_ = "LEMMA"
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
# DEP requires DEP
|
# DEP requires DEP
|
||||||
|
|
|
@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
|
||||||
doc2 = Doc(en_vocab, words=["Test"])
|
doc2 = Doc(en_vocab, words=["Test"])
|
||||||
doc2[0].tag_ = "TAG"
|
doc2[0].tag_ = "TAG"
|
||||||
doc2[0].pos_ = "X"
|
doc2[0].pos_ = "X"
|
||||||
doc2[0].morph_ = "Feat=Val"
|
doc2[0].set_morph("Feat=Val")
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
matcher = PhraseMatcher(en_vocab, validate=True)
|
matcher = PhraseMatcher(en_vocab, validate=True)
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
|
@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
doc2 = Doc(en_vocab, words=["Test"])
|
doc2 = Doc(en_vocab, words=["Test"])
|
||||||
doc2[0].tag_ = "TAG"
|
doc2[0].tag_ = "TAG"
|
||||||
doc2[0].pos_ = "X"
|
doc2[0].pos_ = "X"
|
||||||
doc2[0].morph_ = "Feat=Val"
|
doc2[0].set_morph("Feat=Val")
|
||||||
doc2[0].lemma_ = "LEMMA"
|
doc2[0].lemma_ = "LEMMA"
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
# DEP requires DEP
|
# DEP requires DEP
|
||||||
|
|
|
@ -339,7 +339,6 @@ def test_ner_warns_no_lookups(caplog):
|
||||||
nlp.vocab.lookups = Lookups()
|
nlp.vocab.lookups = Lookups()
|
||||||
assert not len(nlp.vocab.lookups)
|
assert not len(nlp.vocab.lookups)
|
||||||
nlp.add_pipe("ner")
|
nlp.add_pipe("ner")
|
||||||
nlp.config["initialize"]["lookups"] = None
|
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
assert "W033" in caplog.text
|
assert "W033" in caplog.text
|
||||||
|
|
|
@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
|
||||||
a.add(**p)
|
a.add(**p)
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
assert doc.has_annotation("LEMMA")
|
assert doc.has_annotation("LEMMA")
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
|
|
||||||
|
@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
assert doc.has_annotation("LEMMA")
|
assert doc.has_annotation("LEMMA")
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
nlp.remove_pipe("attribute_ruler")
|
nlp.remove_pipe("attribute_ruler")
|
||||||
|
@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
)
|
)
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
assert doc.has_annotation("LEMMA")
|
assert doc.has_annotation("LEMMA")
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
|
|
||||||
|
@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
|
||||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
|
||||||
assert doc[3].lemma_ == "cat"
|
assert doc[3].lemma_ == "cat"
|
||||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
|
||||||
|
|
||||||
dev_examples = [
|
dev_examples = [
|
||||||
Example.from_dict(
|
Example.from_dict(
|
||||||
|
@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
if i == 4:
|
if i == 4:
|
||||||
assert doc[i].pos_ == "PUNCT"
|
assert doc[i].pos_ == "PUNCT"
|
||||||
assert doc[i].morph_ == "PunctType=peri"
|
assert str(doc[i].morph) == "PunctType=peri"
|
||||||
else:
|
else:
|
||||||
assert doc[i].pos_ == ""
|
assert doc[i].pos_ == ""
|
||||||
assert doc[i].morph_ == ""
|
assert str(doc[i].morph) == ""
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_morph_rules(nlp, morph_rules):
|
def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||||
|
@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
if i != 2:
|
if i != 2:
|
||||||
assert doc[i].pos_ == ""
|
assert doc[i].pos_ == ""
|
||||||
assert doc[i].morph_ == ""
|
assert str(doc[i].morph) == ""
|
||||||
else:
|
else:
|
||||||
assert doc[2].pos_ == "DET"
|
assert doc[2].pos_ == "DET"
|
||||||
assert doc[2].lemma_ == "a"
|
assert doc[2].lemma_ == "a"
|
||||||
assert doc[2].morph_ == "Case=Nom"
|
assert str(doc[2].morph) == "Case=Nom"
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_indices(nlp):
|
def test_attributeruler_indices(nlp):
|
||||||
|
@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
if i == 1:
|
if i == 1:
|
||||||
assert doc[i].lemma_ == "was"
|
assert doc[i].lemma_ == "was"
|
||||||
assert doc[i].morph_ == "Case=Nom|Number=Sing"
|
assert str(doc[i].morph) == "Case=Nom|Number=Sing"
|
||||||
elif i == 2:
|
elif i == 2:
|
||||||
assert doc[i].lemma_ == "the"
|
assert doc[i].lemma_ == "the"
|
||||||
assert doc[i].morph_ == "Case=Nom|Number=Plur"
|
assert str(doc[i].morph) == "Case=Nom|Number=Plur"
|
||||||
elif i == 3:
|
elif i == 3:
|
||||||
assert doc[i].lemma_ == "cat"
|
assert doc[i].lemma_ == "cat"
|
||||||
else:
|
else:
|
||||||
assert doc[i].morph_ == ""
|
assert str(doc[i].morph) == ""
|
||||||
# raises an error when trying to modify a token outside of the match
|
# raises an error when trying to modify a token outside of the match
|
||||||
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
|
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
|
|
|
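The test changes above all follow the same migration: the removed `Token.morph_` string attribute is read back by converting the `MorphAnalysis` object with `str()`. A minimal sketch of that pattern outside the test suite, driven by the attribute ruler these tests exercise (the pattern, lemma and morph values are illustrative):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("attribute_ruler")
# one Matcher pattern plus the attributes to set on matching tokens
ruler.add([[{"ORTH": "cats"}]], {"LEMMA": "cat", "MORPH": "Number=Plur"})
doc = nlp("These are cats")
assert doc[2].lemma_ == "cat"
assert str(doc[2].morph) == "Number=Plur"
```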
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     doc = nlp(test_text)
     gold_morphs = ["Feat=N", "Feat=V", "", ""]
     gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
-    assert [t.morph_ for t in doc] == gold_morphs
+    assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags

     # Also test the results are still the same after IO
@@ -99,5 +99,5 @@ def test_overfitting_IO():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [str(t.morph) for t in doc2] == gold_morphs
         assert [t.pos_ for t in doc2] == gold_pos_tags
@@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
     words = ["This", "is", "10", "%", "."]
     tags = ["DT", "VBZ", "CD", "NN", "."]
     pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [("PERCENT", 2, 4)]
+    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
     doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
     assert doc.has_annotation("TAG")
     expected = ("10", "NUM", "CD", "PERCENT")
@@ -76,7 +76,7 @@ def tagged_doc():
     for i in range(len(tags)):
         doc[i].tag_ = tags[i]
         doc[i].pos_ = pos[i]
-        doc[i].morph_ = morphs[i]
+        doc[i].set_morph(morphs[i])
         if i > 0:
             doc[i].is_sent_start = False
     return doc
@@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
         doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
+            ents=["B-CARDINAL", "O", "B-CARDINAL"],
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
@@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
         doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
+            ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
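For context on the helper used in the scorer tests above: `offsets_to_biluo_tags` turns character-offset entity annotations into per-token tags. A small sketch, assuming the function is importable from `spacy.training` as in these tests (sentence and offsets are illustrative):

```python
import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
doc = nlp("I flew to Silicon Valley")
# (start_char, end_char, label) covering "Silicon Valley"
tags = offsets_to_biluo_tags(doc, [(10, 24, "LOC")])
assert tags == ["O", "O", "O", "B-LOC", "L-LOC"]
```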
@@ -242,7 +242,7 @@ def test_tag_score(tagged_doc):
     gold = {
         "tags": [t.tag_ for t in tagged_doc],
         "pos": [t.pos_ for t in tagged_doc],
-        "morphs": [t.morph_ for t in tagged_doc],
+        "morphs": [str(t.morph) for t in tagged_doc],
         "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
     }
     example = Example.from_dict(tagged_doc, gold)
@@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
     tags[0] = "NN"
     pos = [t.pos_ for t in tagged_doc]
     pos[1] = "X"
-    morphs = [t.morph_ for t in tagged_doc]
+    morphs = [str(t.morph) for t in tagged_doc]
     morphs[1] = "Number=sing"
     morphs[2] = "Number=plur"
     gold = {
@@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots):
     predicted = Doc(vocab, words=annots["words"])
     example = Example.from_dict(predicted, annots)
     for i, token in enumerate(example.reference):
-        assert token.morph_ == annots["morphs"][i]
+        assert str(token.morph) == annots["morphs"][i]


 @pytest.mark.parametrize(
@@ -30,7 +30,12 @@ def doc(en_vocab):
     heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
     deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
     lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
+    ents = ["O"] * len(words)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[5] = "B-LOC"
+    ents[6] = "I-LOC"
+    ents[8] = "B-GPE"
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     # fmt: on
     doc = Doc(
@@ -455,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc):
     idx = [t.idx for t in doc]
     tags = [t.tag_ for t in doc]
     pos = [t.pos_ for t in doc]
-    morphs = [t.morph_ for t in doc]
+    morphs = [str(t.morph) for t in doc]
     lemmas = [t.lemma_ for t in doc]
    deps = [t.dep_ for t in doc]
     heads = [t.head.i for t in doc]
@@ -477,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc):
     assert idx == [t.idx for t in reloaded_example.reference]
     assert tags == [t.tag_ for t in reloaded_example.reference]
     assert pos == [t.pos_ for t in reloaded_example.reference]
-    assert morphs == [t.morph_ for t in reloaded_example.reference]
+    assert morphs == [str(t.morph) for t in reloaded_example.reference]
     assert lemmas == [t.lemma_ for t in reloaded_example.reference]
     assert deps == [t.dep_ for t in reloaded_example.reference]
     assert heads == [t.head.i for t in reloaded_example.reference]
@@ -101,7 +101,7 @@ class DocBin:
            self.strings.add(token.text)
            self.strings.add(token.tag_)
            self.strings.add(token.lemma_)
-            self.strings.add(token.morph_)
+            self.strings.add(str(token.morph))
            self.strings.add(token.dep_)
            self.strings.add(token.ent_type_)
            self.strings.add(token.ent_kb_id_)
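Because `DocBin` now interns the morph analysis via `str(token.morph)`, morphological features survive a serialization round trip. A minimal sketch (text and feature values are illustrative, and it assumes `MORPH` is among the attributes serialized by default):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp("This is a test.")
doc[0].set_morph("Number=Sing")

doc_bin = DocBin()
doc_bin.add(doc)
data = doc_bin.to_bytes()

reloaded = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
assert str(reloaded[0][0].morph) == "Number=Sing"
```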
@@ -213,8 +213,9 @@ cdef class Doc:
        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
            the same length as words, to assign as token.is_sent_start. Will be
            overridden by heads if heads is provided. Defaults to None.
-        ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
-            (label, start, end) tuples to assign as doc.ents. Defaults to None.
+        ents (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, as IOB tags to assign as token.ent_iob and
+            token.ent_type. Defaults to None.

        DOCS: https://nightly.spacy.io/api/doc#init
        """
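Per the updated docstring, entities are now passed to `Doc.__init__` as one IOB tag per token instead of `(label, start, end)` tuples. A short sketch of the new call (words and labels are illustrative):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["Apple", "opened", "an", "office", "in", "Paris", "."]
ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O"]
doc = Doc(Vocab(), words=words, ents=ents)
assert [(e.text, e.label_) for e in doc.ents] == [("Apple", "ORG"), ("Paris", "GPE")]
```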
@@ -275,16 +276,55 @@ cdef class Doc:
                    sent_starts[i] = -1
                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
                    sent_starts[i] = 0
+        ent_iobs = None
+        ent_types = None
+        if ents is not None:
+            iob_strings = Token.iob_strings()
+            # make valid IOB2 out of IOB1 or IOB2
+            for i, ent in enumerate(ents):
+                if ent is "":
+                    ents[i] = None
+                elif ent is not None and not isinstance(ent, str):
+                    raise ValueError(Errors.E177.format(tag=ent))
+                if i < len(ents) - 1:
+                    # OI -> OB
+                    if (ent is None or ent.startswith("O")) and \
+                            (ents[i+1] is not None and ents[i+1].startswith("I")):
+                        ents[i+1] = "B" + ents[i+1][1:]
+                    # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
+                    if ent is not None and ents[i+1] is not None and \
+                            (ent.startswith("B") or ent.startswith("I")) and \
+                            ents[i+1].startswith("I") and \
+                            ent[1:] != ents[i+1][1:]:
+                        ents[i+1] = "B" + ents[i+1][1:]
+            ent_iobs = []
+            ent_types = []
+            for ent in ents:
+                if ent is None:
+                    ent_iobs.append(iob_strings.index(""))
+                    ent_types.append("")
+                elif ent == "O":
+                    ent_iobs.append(iob_strings.index(ent))
+                    ent_types.append("")
+                else:
+                    if len(ent) < 3 or ent[1] != "-":
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob, ent_type = ent.split("-", 1)
+                    if ent_iob not in iob_strings:
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob = iob_strings.index(ent_iob)
+                    ent_iobs.append(ent_iob)
+                    ent_types.append(ent_type)
        headings = []
        values = []
-        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
-        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
        for a, annot in enumerate(annotations):
            if annot is not None:
                if len(annot) != len(words):
                    raise ValueError(Errors.E189)
                headings.append(possible_headings[a])
-                if annot is not heads and annot is not sent_starts:
+                if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                    values.extend(annot)
        for value in values:
            self.vocab.strings.add(value)
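The loop added above normalizes IOB1 input into valid IOB2 before splitting the tags. The same rules, written as a hypothetical standalone helper (`to_iob2` is not part of spaCy; it only mirrors the logic for clarity):

```python
def to_iob2(tags):
    # An I-X that follows "O" (or starts the sequence), or that follows a
    # B/I tag of a different entity type, is rewritten to B-X.
    tags = list(tags)
    for i, tag in enumerate(tags):
        if not tag.startswith("I-"):
            continue
        prev = tags[i - 1] if i > 0 else "O"
        if prev == "O" or prev[2:] != tag[2:]:
            tags[i] = "B-" + tag[2:]
    return tags

assert to_iob2(["O", "I-PER", "I-PER"]) == ["O", "B-PER", "I-PER"]
assert to_iob2(["B-ORG", "I-LOC"]) == ["B-ORG", "B-LOC"]
```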
@@ -296,7 +336,7 @@ cdef class Doc:
        j = 0
        for annot in annotations:
            if annot:
-                if annot is heads or annot is sent_starts:
+                if annot is heads or annot is sent_starts or annot is ent_iobs:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = annot[i]
@@ -317,8 +357,6 @@ cdef class Doc:
                            attrs[i, j] = self.vocab.strings[annot[i]]
                j += 1
        self.from_array(headings, attrs)
-        if ents is not None:
-            self.ents = ents

    @property
    def _(self):
@@ -1210,7 +1248,7 @@ cdef class Doc:
        for token in self:
            strings.add(token.tag_)
            strings.add(token.lemma_)
-            strings.add(token.morph_)
+            strings.add(str(token.morph))
            strings.add(token.dep_)
            strings.add(token.ent_type_)
            strings.add(token.ent_kb_id_)
@@ -215,20 +215,20 @@ cdef class Token:
        def __get__(self):
            return MorphAnalysis.from_id(self.vocab, self.c.morph)

-        def __set__(self, attr_t morph):
-            if morph == 0:
-                self.c.morph = morph
-            elif morph in self.vocab.strings:
-                self.morph_ = self.vocab.strings[morph]
-            else:
-                raise ValueError(Errors.E1009.format(val=morph))
-
-    property morph_:
-        def __get__(self):
-            return str(MorphAnalysis.from_id(self.vocab, self.c.morph))
-
-        def __set__(self, features):
-            cdef hash_t key = self.vocab.morphology.add(features)
-            self.c.morph = key
+        def __set__(self, MorphAnalysis morph):
+            # Check that the morph has the same vocab
+            if self.vocab != morph.vocab:
+                raise ValueError(Errors.E1013)
+            self.c.morph = morph.c.key
+
+    def set_morph(self, features):
+        cdef hash_t key
+        if features is 0:
+            self.c.morph = 0
+        else:
+            if isinstance(features, int):
+                features = self.vocab.strings[features]
+            key = self.vocab.morphology.add(features)
+            self.c.morph = key

    @property
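With the property rewritten as above, `Token.morph` only accepts a `MorphAnalysis` from the same vocab, and setting features from a FEATS string goes through the new `Token.set_morph`. A minimal sketch of the write path (text and features are illustrative):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("She sings")
doc[1].set_morph("Number=Sing|Person=3")   # replaces the old `morph_` setter
assert str(doc[1].morph) == "Number=Sing|Person=3"
# direct assignment now expects a MorphAnalysis object, not a string
doc[0].morph = doc[1].morph
assert str(doc[0].morph) == "Number=Sing|Person=3"
```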
@@ -207,6 +207,7 @@ def conllu_sentence_to_doc(
        pos=poses,
        deps=deps,
        lemmas=lemmas,
+        morphs=morphs,
        heads=heads,
    )
    for i in range(len(doc)):
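The converter above can hand `morphs` straight to `Doc.__init__`, since the constructor accepts one FEATS string per token. A small sketch with illustrative values:

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["She", "sings"]
morphs = ["Case=Nom", "Number=Sing|Person=3"]
doc = Doc(Vocab(), words=words, morphs=morphs)
assert str(doc[1].morph) == "Number=Sing|Person=3"
```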
@@ -1,4 +1,4 @@
-from collections import Iterable as IterableInstance
+from collections.abc import Iterable as IterableInstance
 import warnings
 import numpy
 from murmurhash.mrmr cimport hash64
@@ -226,7 +226,7 @@ cdef class Example:
            "TAG": [t.tag_ for t in self.reference],
            "LEMMA": [t.lemma_ for t in self.reference],
            "POS": [t.pos_ for t in self.reference],
-            "MORPH": [t.morph_ for t in self.reference],
+            "MORPH": [str(t.morph) for t in self.reference],
            "HEAD": [t.head.i for t in self.reference],
            "DEP": [t.dep_ for t in self.reference],
            "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]
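`Example.to_dict` now emits `MORPH` as FEATS strings, which is also the format `Example.from_dict` expects on the way in. A brief sketch (annotations are illustrative):

```python
import spacy
from spacy.tokens import Doc
from spacy.training import Example

nlp = spacy.blank("en")
predicted = Doc(nlp.vocab, words=["She", "sings"])
reference = {"words": ["She", "sings"], "morphs": ["Case=Nom", "Number=Sing|Person=3"]}
example = Example.from_dict(predicted, reference)
assert str(example.reference[1].morph) == "Number=Sing|Person=3"
```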
@@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
            if include_annotation["POS"]:
                json_token["pos"] = token.pos_
            if include_annotation["MORPH"]:
-                json_token["morph"] = token.morph_
+                json_token["morph"] = str(token.morph)
            if include_annotation["LEMMA"]:
                json_token["lemma"] = token.lemma_
            if include_annotation["DEP"]:
@@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
 a feed-forward subnetwork to build mixed representations. The features used are
-the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
-depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
-static vectors can also be incorporated into the concatenated representation.
+the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
+[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
+vectors can also be incorporated into the concatenated representation.

 | Name                      | Description |
 | ------------------------- | ----------- |
@@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
 | `key_attr`  | Defaults to `"ORTH"`. ~~str~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |

+### spacy.FeatureExtractor.v1 {#FeatureExtractor}
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.FeatureExtractor.v1"
+> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+> ```
+
+Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
+of feature names to extract, which should refer to token attributes.
+
+| Name        | Description |
+| ----------- | ------------------------------------------------------------------------ |
+| `columns`   | The token attributes to extract. ~~List[Union[int, str]]~~ |
+| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
+
 ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}

 The following architectures are provided by the package
@@ -186,15 +186,14 @@ This functionality was previously available as part of the command `init-model`.
 </Infobox>

 ```cli
-$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
 ```

 | Name               | Description |
-| ----------------------- | ---------------------------------------------------------------------------------------------------------------------- |
+| ------------------ | ---------------------------------------------------------------------------------------------------------------------- |
 | `lang`             | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ |
 | `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
-| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
 | `--prune`, `-p`    | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
 | `--name`, `-n`     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
@@ -202,6 +201,39 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES**        | A spaCy pipeline directory containing the vocab and vectors. |

+### init labels {#init-labels new="3" tag="command"}
+
+Generate JSON files for the labels in the data. This helps speed up the training
+process, since spaCy won't have to preprocess the data to extract the labels.
+After generating the labels, you can provide them to components that accept a
+`labels` argument on initialization via the
+[`[initialize]`](/api/data-formats#config-initialize) block of your config.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+```cli
+$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
+```
+
+| Name              | Description |
+| ----------------- | ----------- |
+| `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
+| `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
+| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
+| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~ |
+| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
+| **CREATES**       | The label files, one JSON file per component. |
+
 ## convert {#convert tag="command"}

 Convert files into spaCy's
@@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.
 > data_path = "/path/to/component_data"
 > ```

-<!-- TODO: -->
-
 | Name           | Description |
 | -------------- | ----------- |
 | `components`   | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ |
@@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)

 ## Lexical data for vocabulary {#vocab-jsonl new="2"}

-To populate a pipeline's vocabulary, you can use the
-[`spacy init vectors`](/api/cli#init-vectors) command and load in a
-[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
-lexical entry per line via the `--jsonl-loc` option. The first line defines the
-language and vocabulary settings. All other lines are expected to be JSON
-objects describing an individual lexeme. The lexical attributes will be then set
-as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
-command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
-lexical data.
+This data file can be provided via the `vocab_data` setting in the
+`[initialize]` block of the training config to pre-define the lexical data to
+initialize the `nlp` object's vocabulary with. The file should contain one
+lexical entry per line. The first line defines the language and vocabulary
+settings. All other lines are expected to be JSON objects describing an
+individual lexeme. The lexical attributes will be then set as attributes on
+spaCy's [`Lexeme`](/api/lexeme#attributes) object.
+
+> #### Example config
+>
+> ```ini
+> [initialize]
+> vocab_data = "/path/to/vocab-data.jsonl"
+> ```

 ```python
 ### First line
@@ -21,8 +21,9 @@ non-projective parses.
 The parser is trained using an **imitation learning objective**. It follows the
 actions predicted by the current weights, and at each state, determines which
 actions are compatible with the optimal parse that could be reached from the
-current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
-that more than one action may be optimal for a given state.
+current state. The weights are updated such that the scores assigned to the set
+of optimal actions is increased, while scores assigned to other actions are
+decreased. Note that more than one action may be optimal for a given state.

 ## Config and implementation {#config}

@@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## DependencyParser.initialize {#initialize tag="method"}
+## DependencyParser.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@@ -162,12 +166,22 @@ This method was previously called `begin_training`.
 > parser = nlp.add_pipe("parser")
 > parser.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.parser]
+>
+> [initialize.components.parser.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/parser.json"
+> ```

 | Name           | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |  |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

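The new `labels` argument documented above can also be passed programmatically. A hedged sketch that loads a file previously written by `init labels`; the path is illustrative and the empty `get_examples` callback simply mirrors the example in the docs:

```python
import spacy
import srsly

nlp = spacy.blank("en")
parser = nlp.add_pipe("parser")
# assumes corpus/labels/parser.json was generated by `spacy init labels`
label_data = srsly.read_json("corpus/labels/parser.json")
parser.initialize(lambda: [], nlp=nlp, labels=label_data)
```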
 ## DependencyParser.predict {#predict tag="method"}

@@ -32,7 +32,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > ```

 | Name                                     | Description |
-| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`                                  | A storage container for lexical types. ~~Vocab~~ |
 | `words`                                  | A list of strings to add to the container. ~~Optional[List[str]]~~ |
 | `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
@@ -45,7 +45,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
 | `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
 | `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ |
-| `ents` <Tag variant="new">3</Tag>        | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |
+| `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |

 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}

@@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.
 Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
-`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
+using the `"_"` key and specifying a dictionary that maps attribute names to
+values.

 > #### Example
 >
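The reworded passage above describes passing writable custom extension attributes through the `"_"` key. A short sketch (the extension name and text are illustrative):

```python
import spacy
from spacy.tokens import Token

Token.set_extension("is_musician", default=False)

nlp = spacy.blank("en")
doc = nlp("David Bowie sang")
with doc.retokenize() as retokenizer:
    attrs = {"LEMMA": "David Bowie", "_": {"is_musician": True}}
    retokenizer.merge(doc[0:2], attrs=attrs)
assert doc[0].text == "David Bowie"
assert doc[0]._.is_musician is True
```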
@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## EntityLinker.initialize {#initialize tag="method"}
+## EntityLinker.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## EntityRecognizer.initialize {#initialize tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@@ -152,12 +155,22 @@ This method was previously called `begin_training`.
 > ner = nlp.add_pipe("ner")
 > ner.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```

 | Name           | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |  |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## EntityRecognizer.predict {#predict tag="method"}

@@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
 | **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~ |

-## Language.initialize {#initialize tag="method"}
+## Language.initialize {#initialize tag="method" new="3"}

 Initialize the pipeline for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
@@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 > #### Example
 >
@@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).
 > morphologizer = nlp.add_pipe("morphologizer")
 > morphologizer.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.morphologizer]
+>
+> [initialize.components.morphologizer.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/morphologizer.json"
+> ```

 | Name           | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |  |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

 ## Morphologizer.predict {#predict tag="method"}

@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## Pipe.initialize {#initialize tag="method"}
+## Pipe.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## Tagger.initialize {#initialize tag="method"}
+## Tagger.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.

 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">

@@ -135,12 +138,22 @@ This method was previously called `begin_training`.
 > tagger = nlp.add_pipe("tagger")
 > tagger.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.tagger]
+>
+> [initialize.components.tagger.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/tagger.json"
+> ```

 | Name           | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |  |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |

 ## Tagger.predict {#predict tag="method"}

@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
|
||||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||||
|
|
||||||
## TextCategorizer.initialize {#initialize tag="method"}
|
## TextCategorizer.initialize {#initialize tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
||||||
|
@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
|
||||||
network,
|
network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize).
|
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
arguments it receives via the
|
||||||
|
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||||
|
config.
|
||||||
|
|
||||||
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
|
||||||
|
|
||||||
|
@ -148,12 +151,22 @@ This method was previously called `begin_training`.
|
||||||
> textcat = nlp.add_pipe("textcat")
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> textcat.initialize(lambda: [], nlp=nlp)
|
> textcat.initialize(lambda: [], nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> ### config.cfg
|
||||||
|
> [initialize.components.textcat]
|
||||||
|
>
|
||||||
|
> [initialize.components.textcat.labels]
|
||||||
|
> @readers = "spacy.read_labels.v1"
|
||||||
|
> path = "corpus/labels/textcat.json
|
||||||
|
> ```
|
||||||
|
|
||||||
 | Name           | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ | |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
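For comparison with the config-driven setup above, here is a minimal sketch (the label names are made up for illustration) that registers the labels directly on the component before calling `initialize`, instead of reading them via `spacy.read_labels.v1`:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
# Register the labels up front instead of extracting them from get_examples
# or reading them from a labels file in the [initialize] block.
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
textcat.initialize(lambda: [], nlp=nlp)
```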
 ## TextCategorizer.predict {#predict tag="method"}
@@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `limit`     | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
 | **CREATES** | The corpus reader. ~~JsonlTexts~~ |

+### spacy.read_labels.v1 {#read_labels tag="registered function"}
+
+Read a JSON-formatted labels file generated with
+[`init labels`](/api/cli#init-labels). Typically used in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the training
+config to speed up the model initialization process and provide pre-generated
+label sets.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components]
+>
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+| Name        | Description |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ |
+| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
+| **CREATES** | The list of labels. ~~List[str]~~ |
+
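As a rough sketch of what this reader does outside of a config, assuming it can be resolved from `registry.readers` and called with the documented `path` and `require` arguments (the path matches the example config above):

```python
from pathlib import Path

from spacy.util import registry

# Resolve the registered reader and load a labels file produced by
# `spacy init labels`. With require=False, a missing file yields None and
# initialize() falls back to extracting labels from the data.
read_labels = registry.readers.get("spacy.read_labels.v1")
labels = read_labels(path=Path("corpus/labels/ner.json"), require=False)
print(labels)
```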
 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}

 A data batcher implements a batching strategy that essentially turns a stream of
@@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
 embeddings.

 ```python
-from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor
+from thinc.api import add, chain, remap_ids, Embed
 from spacy.ml.staticvectors import StaticVectors
+from spacy.ml.featureextractor import FeatureExtractor
 from spacy.util import registry

 @registry.architectures("my_example.MyEmbedding.v1")
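For orientation, a reduced sketch of a registered embedding layer built from these pieces: it only returns the (projected) static vectors and leaves out the summed learned-embedding table the full example builds on top. The architecture name and width are placeholders, and the pipeline is assumed to have pretrained vectors loaded at initialization time.

```python
from typing import List

from thinc.api import Model, chain, ragged2list
from thinc.types import Floats2d

from spacy.ml.staticvectors import StaticVectors
from spacy.tokens import Doc
from spacy.util import registry


@registry.architectures("my_example.StaticEmbedding.v1")
def StaticEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]:
    # StaticVectors maps each Doc's tokens to the pipeline's pretrained
    # vectors (projected to output_width); ragged2list returns one array
    # per Doc, as tok2vec embedding layers are expected to do.
    return chain(StaticVectors(nO=output_width), ragged2list())
```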
@@ -204,7 +204,19 @@ initialize it.

 ![Illustration of pipeline lifecycle](../images/lifecycle.svg)

-<!-- TODO: explain lifecycle and initialization -->
+At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
+config and load all data, including tokenization rules, model weights and other
+resources from the pipeline directory. The `[training]` block contains the
+settings for training the model and is only used during training. Similarly, the
+`[initialize]` block defines how the initial `nlp` object should be set up
+before training and whether it should be initialized with vectors or pretrained
+tok2vec weights, or any other data needed by the components.
+
+The initialization settings are only loaded and used when
+[`nlp.initialize`](/api/language#initialize) is called (typically right before
+training). This allows you to set up your pipeline using local data resources
+and custom functions, and preserve the information in your config – but without
+requiring it to be available at runtime.
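A minimal sketch of that lifecycle in code (the component and label are placeholders): the initialization step runs once, and a trained pipeline loaded from disk does not run it again.

```python
import spacy

# Build the pipeline as defined by the [nlp] and [components] blocks.
nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
tagger.add_label("NOUN")

# Load data resources, infer missing shapes and set up the label scheme.
# This is the point where the [initialize] block would be applied during
# training; at runtime the trained pipeline is simply loaded instead.
nlp.initialize()
```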
 ### Overwriting config settings on the command line {#config-overrides}
@@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
     return create_model(output_width)
 ```

+<!-- TODO:
+### Customizing the initialization {#initialization}
+-->
+
 ## Data utilities {#data}

 spaCy includes various features and utilities to make it easy to train models
@@ -853,7 +869,7 @@ nlp = spacy.blank("en")
 docbin = DocBin()
 words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
 spaces = [True, True, True, True, True, True, True, False]
-ents = [("ORG", 0, 1), ("GPE", 5, 6)]
+ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
 doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
 docbin.add(doc)
 docbin.to_disk("./train.spacy")
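To sanity-check the file, the annotations can be loaded back with `DocBin.from_disk` and a matching vocab. A small sketch, assuming the `train.spacy` file written above:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin().from_disk("./train.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
# The IOB-style ents above come back as entity spans on the Doc.
print([(ent.text, ent.label_) for ent in docs[0].ents])
```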
@@ -104,7 +104,6 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 >
 > ```ini
 > [training]
-> vectors = null
 > accumulate_gradient = 3
 >
 > [training.optimizer]
@@ -430,6 +429,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
 | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
 | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
+| [`init vectors`](/api/cli#init-vectors) | Convert word vectors for use with spaCy. |
+| [`init labels`](/api/cli#init-labels) | Generate JSON files for the labels in the data to speed up training. |
 | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
 | [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
@@ -1,6 +1,11 @@
 const autoprefixer = require('autoprefixer')
 const path = require('path')

+// https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/
+const sharp = require('sharp')
+sharp.cache(false)
+sharp.simd(false)
+
 // Markdown plugins
 const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js')
 const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js')