Renaming gold & annotation_setter (#6042)
* version bump to 3.0.0a16
* rename "gold" folder to "training"
* rename 'annotation_setter' to 'set_extra_annotations'
* formatting
This commit is contained in:
parent 60f22e1800
commit 8e7557656f
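For downstream code, the practical effect of this commit is a renamed import path. A minimal before/after sketch (hypothetical user code, assuming a spacy-nightly 3.0.0a16 install):

```python
# Before 3.0.0a16, the training utilities lived under spacy.gold:
# from spacy.gold import Example, Corpus, docs_to_json

# From 3.0.0a16 on, the same names import from spacy.training:
from spacy.training import Example, Corpus, docs_to_json
```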
@@ -1,7 +1,7 @@
 from pathlib import Path
 import plac
 import spacy
-from spacy.gold import docs_to_json
+from spacy.training import docs_to_json
 import srsly
 import sys

setup.py
@@ -23,7 +23,7 @@ Options.docstrings = True

 PACKAGES = find_packages()
 MOD_NAMES = [
-    "spacy.gold.example",
+    "spacy.training.example",
     "spacy.parts_of_speech",
     "spacy.strings",
     "spacy.lexeme",

@@ -48,7 +48,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.tokenizer",
-    "spacy.gold.gold_io",
+    "spacy.training.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",
     "spacy.tokens.token",
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a15"
+__version__ = "3.0.0a16"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -7,9 +7,9 @@ import re
 import sys

 from ._util import app, Arg, Opt
-from ..gold import docs_to_json
+from ..training import docs_to_json
 from ..tokens import DocBin
-from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs


 # Converters are matched by file extension except for ner/iob, which are
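The CLI converters move with the package. A minimal sketch of the renamed `docs_to_json` helper, assuming 3.0.0a16 (the output path is hypothetical):

```python
import spacy
import srsly
from spacy.training import docs_to_json  # previously spacy.gold.docs_to_json

nlp = spacy.blank("en")
docs = [nlp("Berlin is a city."), nlp("So is Paris.")]
# docs_to_json serializes a batch of Docs to the JSON training format.
srsly.write_json("./train.json", [docs_to_json(docs)])
```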
@@ -8,7 +8,7 @@ import typer

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, get_sourced_components
-from ..gold import Corpus, Example
+from ..training import Corpus, Example
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
 from .. import util

@@ -5,7 +5,7 @@ import re
 import srsly
 from thinc.api import require_gpu, fix_random_seed

-from ..gold import Corpus
+from ..training import Corpus
 from ..tokens import Doc
 from ._util import app, Arg, Opt
 from ..scorer import Scorer

@@ -16,7 +16,7 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, get_sourced_components
 from ..language import Language
 from .. import util
-from ..gold.example import Example
+from ..training.example import Example
 from ..errors import Errors

@@ -66,7 +66,7 @@ class Warnings:
             "in problems with the vocab further on in the pipeline.")
     W030 = ("Some entities could not be aligned in the text \"{text}\" with "
             "entities \"{entities}\". Use "
-            "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+            "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
             " to check the alignment. Misaligned entities ('-') will be "
             "ignored during training.")
     W033 = ("Training a new {model} using a model with no lexeme normalization "
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .gold import Example, validate_examples
+from .training import Example, validate_examples
 from .scorer import Scorer
 from .util import create_default_optimizer, registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER

@@ -8,7 +8,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC

@@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
 from ...typedefs cimport weight_t, attr_t
 from ...lexeme cimport Lexeme
 from ...attrs cimport IS_SPACE
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC

@@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
 from ...typedefs cimport attr_t, weight_t
 from ...structs cimport TokenC
 from ...strings cimport StringStore
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC

@@ -4,7 +4,7 @@ from pathlib import Path

 from .pipe import Pipe
 from ..errors import Errors
-from ..gold import validate_examples
+from ..training import validate_examples
 from ..language import Language
 from ..matcher import Matcher
 from ..scorer import Scorer

@@ -9,7 +9,7 @@ from .functions import merge_subtokens
 from ..language import Language
 from ._parser_internals import nonproj
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples


 default_model_config = """

@@ -12,7 +12,7 @@ from ..tokens import Doc
 from .pipe import Pipe, deserialize_config
 from ..language import Language
 from ..vocab import Vocab
-from ..gold import Example, validate_examples
+from ..training import Example, validate_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util

@@ -9,7 +9,7 @@ from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples


 DEFAULT_ENT_ID_SEP = "||"

@@ -8,7 +8,7 @@ from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
-from ..gold import validate_examples
+from ..training import validate_examples
 from .. import util


@@ -16,7 +16,7 @@ from .pipe import deserialize_config
 from .tagger import Tagger
 from .. import util
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples


 default_model_config = """

@@ -8,7 +8,7 @@ from ..tokens.doc cimport Doc

 from .pipe import Pipe
 from .tagger import Tagger
-from ..gold import validate_examples
+from ..training import validate_examples
 from ..language import Language
 from ._parser_internals import nonproj
 from ..attrs import POS, ID

@@ -7,7 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown

 from ..language import Language
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples


 default_model_config = """

@@ -4,7 +4,7 @@ from thinc.api import set_dropout_rate, Model

 from ..tokens.doc cimport Doc

-from ..gold import validate_examples
+from ..training import validate_examples
 from ..errors import Errors
 from .. import util

@@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
 from .pipe import Pipe
 from ..language import Language
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples
 from .. import util


@@ -11,7 +11,7 @@ from .tagger import Tagger
 from ..language import Language
 from ..errors import Errors
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples
 from .. import util

@@ -6,8 +6,8 @@ from thinc.util import to_numpy
 from itertools import islice

 from ..errors import Errors
-from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
-from ..gold import validate_examples
+from ..training import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
+from ..training import validate_examples
 from ..tokens import Doc
 from ..language import Language
 from ..vocab import Vocab

@@ -17,7 +17,7 @@ from ..attrs import POS, ID
 from ..parts_of_speech import X
 from ..errors import Errors, TempErrors, Warnings
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples
 from .. import util


@@ -6,7 +6,7 @@ import numpy

 from .pipe import Pipe
 from ..language import Language
-from ..gold import Example, validate_examples
+from ..training import Example, validate_examples
 from ..errors import Errors
 from ..scorer import Scorer
 from .. import util

@@ -3,7 +3,7 @@ from thinc.api import Model, set_dropout_rate, Optimizer, Config
 from itertools import islice

 from .pipe import Pipe
-from ..gold import Example, validate_examples
+from ..training import Example, validate_examples
 from ..tokens import Doc
 from ..vocab import Vocab
 from ..language import Language

@@ -21,7 +21,7 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc

-from ..gold import validate_examples
+from ..training import validate_examples
 from ..errors import Errors, Warnings
 from .. import util


@@ -12,7 +12,7 @@ from .attrs import NAMES
 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
     from .language import Language  # noqa: F401
-    from .gold import Example  # noqa: F401
+    from .training import Example  # noqa: F401


 ItemT = TypeVar("ItemT")

@@ -1,7 +1,7 @@
 from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
 import numpy as np

-from .gold import Example
+from .training import Example
 from .tokens import Token, Doc, Span
 from .errors import Errors
 from .util import get_lang_class, SimpleFrozenList
@@ -1,4 +1,4 @@
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.pipeline import EntityRecognizer
 from spacy.tokens import Span, Doc
 from spacy import registry

@@ -3,7 +3,7 @@ from thinc.api import Adam, fix_random_seed
 from spacy import registry
 from spacy.attrs import NORM
 from spacy.vocab import Vocab
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.tokens import Doc
 from spacy.pipeline import DependencyParser, EntityRecognizer
 from spacy.pipeline.ner import DEFAULT_NER_MODEL

@@ -1,7 +1,7 @@
 import pytest
 from spacy.vocab import Vocab
 from spacy import registry
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.pipeline import DependencyParser
 from spacy.tokens import Doc
 from spacy.pipeline._parser_internals.nonproj import projectivize

@@ -4,7 +4,7 @@ from spacy.lang.en import English
 from spacy.language import Language
 from spacy.lookups import Lookups
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
 import logging

@@ -1,7 +1,7 @@
 import pytest

 from spacy import registry
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.vocab import Vocab
 from spacy.pipeline._parser_internals.arc_eager import ArcEager
 from spacy.pipeline.transition_parser import Parser

@@ -3,7 +3,7 @@ import pytest
 from spacy.lang.en import English
 from ..util import get_doc, apply_transition_sequence, make_tempdir
 from ... import util
-from ...gold import Example
+from ...training import Example

 TRAIN_DATA = [
     (

@@ -3,7 +3,7 @@ from thinc.api import Adam
 from spacy.attrs import NORM
 from spacy.vocab import Vocab
 from spacy import registry
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.tokens import Doc
 from spacy.pipeline import DependencyParser

@@ -1,6 +1,6 @@
 import pytest
 import numpy
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.en import English
 from spacy.pipeline import AttributeRuler
 from spacy import util, registry

@@ -4,7 +4,7 @@ import pytest
 from spacy.kb import KnowledgeBase, get_candidates, Candidate

 from spacy import util, registry
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.en import English
 from spacy.tests.util import make_tempdir
 from spacy.tokens import Span

@@ -1,7 +1,7 @@
 import pytest

 from spacy import util
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir

@@ -1,7 +1,7 @@
 import pytest

 from spacy import util
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir

@@ -1,6 +1,6 @@
 import pytest
 from spacy.lang.en import English
-from spacy.gold import Example
+from spacy.training import Example
 from spacy import util
 from ..util import make_tempdir

@@ -1,6 +1,6 @@
 import pytest
 from spacy import util
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.en import English
 from spacy.language import Language

@@ -10,7 +10,7 @@ from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

 from ..util import make_tempdir
-from ...gold import Example
+from ...training import Example


 TRAIN_DATA = [

@@ -1,7 +1,7 @@
 import pytest
 import random
 from spacy import util
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.matcher import Matcher
 from spacy.attrs import IS_PUNCT, ORTH, LOWER
 from spacy.vocab import Vocab

@@ -3,7 +3,7 @@ import gc
 import numpy
 import copy

-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.en.stop_words import STOP_WORDS
 from spacy.lang.lex_attrs import is_stop

@@ -3,7 +3,7 @@ import numpy
 from spacy.tokens import Doc
 from spacy.matcher import Matcher
 from spacy.displacy import render
-from spacy.gold import iob_to_biluo
+from spacy.training import iob_to_biluo
 from spacy.lang.it import Italian
 from spacy.lang.en import English

@@ -1,6 +1,6 @@
 import pytest
 from spacy import displacy
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.ja import Japanese
 from spacy.lang.xx import MultiLanguage

@@ -9,7 +9,7 @@ from spacy.tokens import Doc, Token
 from spacy.matcher import Matcher, PhraseMatcher
 from spacy.errors import MatchPatternError
 from spacy.util import minibatch
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.hi import Hindi
 from spacy.lang.es import Spanish
 from spacy.lang.en import English

@@ -2,8 +2,8 @@ import pytest
 from spacy.pipeline import Pipe
 from spacy.matcher import PhraseMatcher, Matcher
 from spacy.tokens import Doc, Span, DocBin
-from spacy.gold import Example, Corpus
-from spacy.gold.converters import json2docs
+from spacy.training import Example, Corpus
+from spacy.training.converters import json2docs
 from spacy.vocab import Vocab
 from spacy.lang.en import English
 from spacy.util import minibatch, ensure_path, load_model
@@ -1,9 +1,7 @@
 import pytest
-from mock import Mock
-from spacy.matcher import DependencyMatcher
 from spacy.tokens import Doc, Span, DocBin
-from spacy.gold import Example
-from spacy.gold.converters.conllu2docs import conllu2docs
+from spacy.training import Example
+from spacy.training.converters.conllu2docs import conllu2docs
 from spacy.lang.en import English
 from spacy.kb import KnowledgeBase
 from spacy.vocab import Vocab

@@ -12,7 +10,7 @@ from spacy.util import ensure_path, load_model_from_path
 import numpy
 import pickle

-from ..util import get_doc, make_tempdir
+from ..util import make_tempdir


 def test_issue4528(en_vocab):
@@ -1,8 +1,8 @@
 import pytest
 from click import NoSuchOption

-from spacy.gold import docs_to_json, biluo_tags_from_offsets
-from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
+from spacy.training import docs_to_json, biluo_tags_from_offsets
+from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.pretrain import make_docs

@@ -3,7 +3,7 @@ import pytest
 from spacy.language import Language
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.lang.en import English
 from spacy.util import registry

@@ -1,5 +1,5 @@
 import pytest
-from spacy.gold.example import Example
+from spacy.training.example import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab

@@ -1,8 +1,8 @@
 from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
-from spacy.gold import Example
-from spacy.gold.iob_utils import biluo_tags_from_offsets
+from spacy.training import Example
+from spacy.training.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc

@@ -6,7 +6,7 @@ from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
 from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
-from spacy.gold import Example
+from spacy.training import Example
 from spacy import util
 from spacy.lang.en import English
 from .util import get_batch
@@ -1,9 +1,10 @@
 import numpy
-from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
-from spacy.gold import spans_from_biluo_tags, iob_to_biluo
-from spacy.gold import Corpus, docs_to_json
-from spacy.gold.example import Example
-from spacy.gold.converters import json2docs
+from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
+from spacy.training import spans_from_biluo_tags, iob_to_biluo
+from spacy.training import Corpus, docs_to_json
+from spacy.training.example import Example
+from spacy.training.converters import json2docs
+from spacy.training.augment import make_orth_variants_example
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch

@@ -12,7 +13,6 @@ import pytest
 import srsly

 from .util import make_tempdir
-from ..gold.augment import make_orth_variants_example


 @pytest.fixture
@@ -5,7 +5,7 @@ from .util import get_random_doc
 from spacy import util
 from spacy.util import dot_to_object, SimpleFrozenList
 from thinc.api import Config, Optimizer
-from spacy.gold.batchers import minibatch_by_words
+from spacy.training.batchers import minibatch_by_words
 from ..lang.en import English
 from ..lang.nl import Dutch
 from ..language import DEFAULT_CONFIG_PATH

@@ -24,7 +24,7 @@ from .util import registry
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
-from .gold import validate_examples
+from .training import validate_examples


 cdef class Tokenizer:
@@ -1,7 +1,7 @@
 from wasabi import Printer

 from .. import tags_to_entities
-from ...gold import iob_to_biluo
+from ...training import iob_to_biluo
 from ...lang.xx import MultiLanguage
 from ...tokens import Doc, Span
 from ...util import load_model

@@ -1,7 +1,7 @@
 import re

 from .conll_ner2docs import n_sents_info
-from ...gold import iob_to_biluo, spans_from_biluo_tags
+from ...training import iob_to_biluo, spans_from_biluo_tags
 from ...tokens import Doc, Token, Span
 from ...vocab import Vocab
 from wasabi import Printer

@@ -1,7 +1,7 @@
 from wasabi import Printer

 from .conll_ner2docs import n_sents_info
-from ...gold import iob_to_biluo, tags_to_entities
+from ...training import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
 from ...util import minibatch

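All three converters now pull `iob_to_biluo` from `spacy.training`. A quick sketch of what that helper does, assuming 3.0.0a16:

```python
from spacy.training import iob_to_biluo  # previously spacy.gold.iob_to_biluo

iob_tags = ["O", "B-LOC", "I-LOC", "O"]
# BILUO makes entity boundaries explicit: the last token of a multi-token
# entity becomes L-*, and single-token entities become U-*.
print(iob_to_biluo(iob_tags))  # ['O', 'B-LOC', 'L-LOC', 'O']
```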
@@ -289,11 +289,11 @@ always be the **last element** in the row.
 > | Column 1 | Column 2 ~~List[Doc]~~ |
 > ```

 | Name | Description |
-| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | The shared vocabulary. ~~Vocab~~ |
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |

 ### List {#list}

@@ -357,12 +357,12 @@ File /path/to/spacy/ml/models/tok2vec.py (line 207)
 ℹ [training.logger]
 Registry          @loggers
 Name              spacy.ConsoleLogger.v1
-Module            spacy.gold.loggers
+Module            spacy.training.loggers
 File              /path/to/spacy/gold/loggers.py (line 8)
 ℹ [training.batcher]
 Registry          @batchers
 Name              spacy.batch_by_words.v1
-Module            spacy.gold.batchers
+Module            spacy.training.batchers
 File              /path/to/spacy/gold/batchers.py (line 49)
 ℹ [training.batcher.size]
 Registry          @schedules

@@ -372,7 +372,7 @@ File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 43)
 ℹ [training.dev_corpus]
 Registry          @readers
 Name              spacy.Corpus.v1
-Module            spacy.gold.corpus
+Module            spacy.training.corpus
 File              /path/to/spacy/gold/corpus.py (line 18)
 ℹ [training.optimizer]
 Registry          @optimizers

@@ -387,7 +387,7 @@ File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 91)
 ℹ [training.train_corpus]
 Registry          @readers
 Name              spacy.Corpus.v1
-Module            spacy.gold.corpus
+Module            spacy.training.corpus
 File              /path/to/spacy/gold/corpus.py (line 18)
 ```

@@ -58,7 +58,7 @@ train/test skew.
 > #### Example
 >
 > ```python
-> from spacy.gold import Corpus
+> from spacy.training import Corpus
 >
 > # With a single file
 > corpus = Corpus("./data/train.spacy")

@@ -82,7 +82,7 @@ Yield examples from the data.
 > #### Example
 >
 > ```python
-> from spacy.gold import Corpus
+> from spacy.training import Corpus
 > import spacy
 >
 > corpus = Corpus("./train.spacy")
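A minimal end-to-end sketch of the renamed `Corpus` reader, assuming 3.0.0a16 and an existing `./train.spacy` file on disk:

```python
import spacy
from spacy.training import Corpus  # previously spacy.gold.Corpus

nlp = spacy.blank("en")
corpus = Corpus("./train.spacy")
# Calling the corpus with an nlp object yields Example objects.
examples = list(corpus(nlp))
```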
@@ -175,7 +175,7 @@ run [`spacy pretrain`](/api/cli#pretrain).
 >
 > ```python
 > from spacy.tokens import DocBin
-> from spacy.gold import Corpus
+> from spacy.training import Corpus
 >
 > doc_bin = DocBin(docs=docs)
 > doc_bin.to_disk("./data.spacy")
@@ -22,7 +22,7 @@ both documents.
 >
 > ```python
 > from spacy.tokens import Doc
-> from spacy.gold import Example
+> from spacy.training import Example
 >
 > words = ["hello", "world", "!"]
 > spaces = [True, False, False]

@@ -48,7 +48,7 @@ see the [training format documentation](/api/data-formats#dict-input).
 >
 > ```python
 > from spacy.tokens import Doc
-> from spacy.gold import Example
+> from spacy.training import Example
 >
 > predicted = Doc(vocab, words=["Apply", "some", "sunscreen"])
 > token_ref = ["Apply", "some", "sun", "screen"]
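A small self-contained sketch of constructing an `Example` via the renamed module, assuming 3.0.0a16:

```python
from spacy.tokens import Doc
from spacy.training import Example  # previously spacy.gold.Example
from spacy.vocab import Vocab

vocab = Vocab()
words = ["hello", "world", "!"]
spaces = [True, False, False]
predicted = Doc(vocab, words=words, spaces=spaces)
reference = Doc(vocab, words=words, spaces=spaces)
# An Example pairs the predicted Doc with a gold-standard reference Doc.
example = Example(predicted, reference)
```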
@@ -301,7 +301,7 @@ tokenizations add up to the same string. For example, you'll be able to align
 > #### Example
 >
 > ```python
-> from spacy.gold import Alignment
+> from spacy.training import Alignment
 >
 > bert_tokens = ["obama", "'", "s", "podcast"]
 > spacy_tokens = ["obama", "'s", "podcast"]
@@ -538,7 +538,7 @@ sequences in the batch.

 ## Training data and alignment {#gold source="spacy/gold"}

-### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
+### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}

 Encode labelled spans into per-token tags, using the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,

@@ -554,7 +554,7 @@ single-token entity.
 > #### Example
 >
 > ```python
-> from spacy.gold import biluo_tags_from_offsets
+> from spacy.training import biluo_tags_from_offsets
 >
 > doc = nlp("I like London.")
 > entities = [(7, 13, "LOC")]
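Expanded into a runnable sketch (assuming 3.0.0a16; the docs' `nlp` is created here as a blank pipeline):

```python
import spacy
from spacy.training import biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("I like London.")
entities = [(7, 13, "LOC")]  # character offsets into the text
tags = biluo_tags_from_offsets(doc, entities)
print(tags)  # ['O', 'O', 'U-LOC', 'O']
```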
@@ -568,7 +568,7 @@ single-token entity.
 | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
 | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |

-### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
+### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}

 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets.

@@ -576,7 +576,7 @@ Encode per-token tags following the
 > #### Example
 >
 > ```python
-> from spacy.gold import offsets_from_biluo_tags
+> from spacy.training import offsets_from_biluo_tags
 >
 > doc = nlp("I like London.")
 > tags = ["O", "O", "U-LOC", "O"]
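And the reverse direction as a runnable sketch, under the same assumptions:

```python
import spacy
from spacy.training import offsets_from_biluo_tags

nlp = spacy.blank("en")
doc = nlp("I like London.")
tags = ["O", "O", "U-LOC", "O"]
print(offsets_from_biluo_tags(doc, tags))  # [(7, 13, 'LOC')]
```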
@@ -590,7 +590,7 @@ Encode per-token tags following the
 | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
 | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ |

-### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
+### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}

 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into

@@ -600,7 +600,7 @@ token-based tags, e.g. to overwrite the `doc.ents`.
 > #### Example
 >
 > ```python
-> from spacy.gold import spans_from_biluo_tags
+> from spacy.training import spans_from_biluo_tags
 >
 > doc = nlp("I like London.")
 > tags = ["O", "O", "U-LOC", "O"]
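A runnable sketch that also overwrites `doc.ents`, as the docs suggest (assuming 3.0.0a16):

```python
import spacy
from spacy.training import spans_from_biluo_tags

nlp = spacy.blank("en")
doc = nlp("I like London.")
tags = ["O", "O", "U-LOC", "O"]
doc.ents = spans_from_biluo_tags(doc, tags)
print([(ent.text, ent.label_) for ent in doc.ents])  # [('London', 'LOC')]
```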
@@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters.
 > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 > ```

 | Setting | Description |
-| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |

 ```python
 https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py

@@ -102,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In
 your application, you would normally use a shortcut for this and instantiate the
 component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).

 | Name | Description |
-| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | The shared vocabulary. ~~Vocab~~ |
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
 | _keyword-only_ | |
 | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |

 ## Transformer.\_\_call\_\_ {#call tag="method"}

@@ -205,7 +205,7 @@ modifying them.

 Assign the extracted features to the Doc objects. By default, the
 [`TransformerData`](/api/transformer#transformerdata) object is written to the
-[`Doc._.trf_data`](#custom-attributes) attribute. Your annotation_setter
+[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations`
 callback is then called, if provided.

 > #### Example

@@ -253,10 +253,10 @@ for doc in nlp.pipe(["some text", "some other text"]):

 You can also customize how the [`Transformer`](/api/transformer) component sets
 annotations onto the [`Doc`](/api/doc), by specifying a custom
-`annotation_setter`. This callback will be called with the raw input and output
-data for the whole batch, along with the batch of `Doc` objects, allowing you to
-implement whatever you need. The annotation setter is called with a batch of
-[`Doc`](/api/doc) objects and a
+`set_extra_annotations` function. This callback will be called with the raw
+input and output data for the whole batch, along with the batch of `Doc`
+objects, allowing you to implement whatever you need. The annotation setter is
+called with a batch of [`Doc`](/api/doc) objects and a
 [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the
 transformers data for the batch.

@@ -267,7 +267,7 @@ def custom_annotation_setter(docs, trf_data):
     doc._.custom_attr = data

 nlp = spacy.load("en_core_trf_lg")
-nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter
+nlp.get_pipe("transformer").set_extra_annotations = custom_annotation_setter
 doc = nlp("This is a text")
 assert isinstance(doc._.custom_attr, TransformerData)
 print(doc._.custom_attr.tensors)
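The callback above is only partially visible in the diff. A fuller sketch, assuming spacy-transformers and a transformer-based pipeline (`en_core_trf_lg` may not exist on your machine; the `doc_data` split is an assumption about `FullTransformerBatch`):

```python
import spacy
from spacy.tokens import Doc

Doc.set_extension("custom_attr", default=None)

def custom_annotation_setter(docs, trf_data):
    # Assumes FullTransformerBatch.doc_data yields one TransformerData
    # per Doc in the batch.
    for doc, data in zip(docs, trf_data.doc_data):
        doc._.custom_attr = data

nlp = spacy.load("en_core_trf_lg")
nlp.get_pipe("transformer").set_extra_annotations = custom_annotation_setter
doc = nlp("This is a text")
print(doc._.custom_attr)
```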
@@ -314,7 +314,7 @@ component:
 >         get_spans=get_doc_spans,
 >         tokenizer_config={"use_fast": True},
 >     ),
->     annotation_setter=null_annotation_setter,
+>     set_extra_annotations=null_annotation_setter,
 >     max_batch_items=4096,
 > )
 > ```

@@ -333,7 +333,7 @@ tokenizer_config = {"use_fast": true}
 [components.transformer.model.get_spans]
 @span_getters = "spacy-transformers.doc_spans.v1"

-[components.transformer.annotation_setter]
+[components.transformer.set_extra_annotations]
 @annotation_setters = "spacy-transformers.null_annotation_setter.v1"

 ```
@@ -1366,7 +1366,7 @@ token.

 ```python
 ### {executable="true"}
-from spacy.gold import Alignment
+from spacy.training import Alignment

 other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
 spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
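A runnable version of the alignment snippet (assuming 3.0.0a16 and that `Alignment.from_strings` is the constructor; adjust if your version differs):

```python
from spacy.training import Alignment

other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
# x2y maps positions in other_tokens to positions in spacy_tokens.
print(align.x2y.lengths)  # one entry per token in other_tokens
```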
@@ -1500,7 +1500,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline
 component function and pass it the token texts from the `Doc` object received by
 the component.

-The [`gold.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very
+The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very
 helpful here, because it takes a `Doc` object and token-based BILUO tags and
 returns a sequence of `Span` objects in the `Doc` with added labels. So all your
 wrapper has to do is compute the entity spans and overwrite the `doc.ents`.

@@ -1515,7 +1515,7 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
 ```python
 ### {highlight="1,8-9"}
 import your_custom_entity_recognizer
-from spacy.gold import offsets_from_biluo_tags
+from spacy.training import offsets_from_biluo_tags
 from spacy.language import Language

 @Language.component("custom_ner_wrapper")
@@ -735,7 +735,7 @@ as **config settings** – in this case, `source`.
 ### functions.py {highlight="7-8"}
 from typing import Callable, Iterator, List
 import spacy
-from spacy.gold import Example
+from spacy.training import Example
 from spacy.language import Language
 import random

|
||||||
### functions.py
|
### functions.py
|
||||||
from typing import Callable, Iterable, Iterator, List
|
from typing import Callable, Iterable, Iterator, List
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.gold import Example
|
from spacy.training import Example
|
||||||
|
|
||||||
@spacy.registry.batchers("filtering_batch.v1")
|
@spacy.registry.batchers("filtering_batch.v1")
|
||||||
def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Example]]]:
|
def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Example]]]:
|
||||||
|
|
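The diff truncates the body of `filter_batch`. One possible completion, under stated assumptions (the filtering predicate is hypothetical; `minibatch` comes from `spacy.util`):

```python
from typing import Callable, Iterable, Iterator, List
import spacy
from spacy.training import Example
from spacy.util import minibatch

@spacy.registry.batchers("filtering_batch.v1")
def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Example]]]:
    def filtered(examples: Iterable[Example]) -> Iterator[List[Example]]:
        # Hypothetical filter: keep only examples whose reference Doc
        # has at least one entity, then batch the rest.
        kept = (eg for eg in examples if eg.reference.ents)
        yield from minibatch(kept, size=size)
    return filtered
```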