mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 01:16:28 +03:00)

Update docstrings, docs and types

This commit is contained in:
parent 7adffc5361
commit e0ffe36e79
@@ -1,7 +1,15 @@
+from typing import Union, List, Iterable, Iterator, TYPE_CHECKING
+from pathlib import Path
 import random
 
 from .. import util
 from .example import Example
 from ..tokens import DocBin, Doc
+from ..vocab import Vocab
+
+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from ..language import Language  # noqa: F401
+
 
 class Corpus:
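
The `TYPE_CHECKING` block added above is the usual way to annotate against a class that would otherwise create a circular import. A minimal, self-contained sketch of the pattern (the `count_tokens` helper is purely illustrative):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers such as mypy, never at runtime,
    # so importing Language here cannot trigger a circular import.
    from spacy.language import Language  # noqa: F401


def count_tokens(nlp: "Language", text: str) -> int:
    # "Language" is a string (forward reference), so the name does not need
    # to exist when this module is imported.
    return len(nlp(text))
```
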
@@ -11,20 +19,23 @@ class Corpus:
     DOCS: https://spacy.io/api/corpus
     """
 
-    def __init__(self, train_loc, dev_loc, limit=0):
+    def __init__(
+        self, train_loc: Union[str, Path], dev_loc: Union[str, Path], limit: int = 0
+    ) -> None:
         """Create a Corpus.
 
         train (str / Path): File or directory of training data.
         dev (str / Path): File or directory of development data.
-        limit (int): Max. number of examples returned
-        RETURNS (Corpus): The newly created object.
+        limit (int): Max. number of examples returned.
+
+        DOCS: https://spacy.io/api/corpus#init
         """
         self.train_loc = train_loc
         self.dev_loc = dev_loc
         self.limit = limit
 
     @staticmethod
-    def walk_corpus(path):
+    def walk_corpus(path: Union[str, Path]) -> List[Path]:
         path = util.ensure_path(path)
         if not path.is_dir():
             return [path]
@@ -43,7 +54,9 @@ class Corpus:
             locs.append(path)
         return locs
 
-    def _make_example(self, nlp, reference, gold_preproc):
+    def _make_example(
+        self, nlp: "Language", reference: Doc, gold_preproc: bool
+    ) -> Example:
         if gold_preproc or reference.has_unknown_spaces:
             return Example(
                 Doc(
@@ -56,7 +69,9 @@ class Corpus:
         else:
             return Example(nlp.make_doc(reference.text), reference)
 
-    def make_examples(self, nlp, reference_docs, max_length=0):
+    def make_examples(
+        self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
+    ) -> Iterator[Example]:
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
@@ -69,7 +84,9 @@ class Corpus:
             elif max_length == 0 or len(ref_sent) < max_length:
                 yield self._make_example(nlp, ref_sent.as_doc(), False)
 
-    def make_examples_gold_preproc(self, nlp, reference_docs):
+    def make_examples_gold_preproc(
+        self, nlp: "Language", reference_docs: Iterable[Doc]
+    ) -> Iterator[Example]:
         for reference in reference_docs:
             if reference.is_sentenced:
                 ref_sents = [sent.as_doc() for sent in reference.sents]
@@ -80,7 +97,9 @@ class Corpus:
                 if len(eg.x):
                     yield eg
 
-    def read_docbin(self, vocab, locs):
+    def read_docbin(
+        self, vocab: Vocab, locs: Iterable[Union[str, Path]]
+    ) -> Iterator[Doc]:
         """ Yield training examples as example dicts """
         i = 0
         for loc in locs:
@@ -96,8 +115,14 @@ class Corpus:
                         if self.limit >= 1 and i >= self.limit:
                             break
 
-    def count_train(self, nlp):
-        """Returns count of words in train examples"""
+    def count_train(self, nlp: "Language") -> int:
+        """Returns count of words in train examples.
+
+        nlp (Language): The current nlp. object.
+        RETURNS (int): The word count.
+
+        DOCS: https://spacy.io/api/corpus#count_train
+        """
         n = 0
         i = 0
         for example in self.train_dataset(nlp):
@@ -108,8 +133,25 @@ class Corpus:
         return n
 
     def train_dataset(
-        self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs
-    ):
+        self,
+        nlp: "Language",
+        *,
+        shuffle: bool = True,
+        gold_preproc: bool = False,
+        max_length: int = 0
+    ) -> Iterator[Example]:
+        """Yield examples from the training data.
+
+        nlp (Language): The current nlp object.
+        shuffle (bool): Whether to shuffle the examples.
+        gold_preproc (bool): Whether to train on gold-standard sentences and tokens.
+        max_length (int): Maximum document length. Longer documents will be
+            split into sentences, if sentence boundaries are available. 0 for
+            no limit.
+        YIELDS (Example): The examples.
+
+        DOCS: https://spacy.io/api/corpus#train_dataset
+        """
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
         if gold_preproc:
             examples = self.make_examples_gold_preproc(nlp, ref_docs)
@@ -120,7 +162,17 @@ class Corpus:
             random.shuffle(examples)
         yield from examples
 
-    def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs):
+    def dev_dataset(
+        self, nlp: "Language", *, gold_preproc: bool = False
+    ) -> Iterator[Example]:
+        """Yield examples from the development data.
+
+        nlp (Language): The current nlp object.
+        gold_preproc (bool): Whether to train on gold-standard sentences and tokens.
+        YIELDS (Example): The examples.
+
+        DOCS: https://spacy.io/api/corpus#dev_dataset
+        """
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
         if gold_preproc:
             examples = self.make_examples_gold_preproc(nlp, ref_docs)
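
For context, a small sketch of how the typed `Corpus` API above might be driven; the `.spacy` paths are placeholders, and `spacy.gold.Corpus` is the import path used in the documentation further down:

```python
import spacy
from spacy.gold import Corpus

# Both arguments may be a single .spacy (DocBin) file or a directory of them.
corpus = Corpus("./train.spacy", "./dev.spacy", limit=0)
nlp = spacy.blank("en")

print("training words:", corpus.count_train(nlp))
# train_dataset() and dev_dataset() yield Example objects.
for example in corpus.train_dataset(nlp, gold_preproc=False, max_length=0):
    print(example.reference.text)
```
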
@@ -21,7 +21,6 @@ class Lemmatizer:
 
         lookups (Lookups): The lookups object containing the (optional) tables
             "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
-        RETURNS (Lemmatizer): The newly constructed object.
         """
         self.lookups = lookups if lookups is not None else Lookups()
         self.is_base_form = is_base_form
@@ -52,8 +52,6 @@ class Lookups:
     def __init__(self) -> None:
         """Initialize the Lookups object.
 
-        RETURNS (Lookups): The newly created object.
-
         DOCS: https://spacy.io/api/lookups#init
         """
         self._tables = {}
@@ -202,7 +200,6 @@ class Table(OrderedDict):
 
         data (dict): The dictionary.
         name (str): Optional table name for reference.
-        RETURNS (Table): The newly created object.
 
         DOCS: https://spacy.io/api/lookups#table.from_dict
         """
@@ -215,7 +212,6 @@ class Table(OrderedDict):
 
         name (str): Optional table name for reference.
         data (dict): Initial data, used to hint Bloom Filter.
-        RETURNS (Table): The newly created object.
 
         DOCS: https://spacy.io/api/lookups#table.init
         """
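
A brief sketch of the `Lookups` and `Table` API whose docstrings are touched above; the table name and entries are illustrative:

```python
from spacy.lookups import Lookups, Table

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog", "cats": "cat"})
table = lookups.get_table("lemma_lookup")
print(table.get("dogs"))  # "dog"

# Table.from_dict builds a standalone table, using the data to hint the Bloom filter.
standalone = Table.from_dict({"mice": "mouse"}, name="exceptions")
print("mice" in standalone)
```
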
@@ -36,7 +36,6 @@ cdef class DependencyMatcher:
 
         vocab (Vocab): The vocabulary object, which must be shared with the
             documents the matcher will operate on.
-        RETURNS (DependencyMatcher): The newly constructed object.
         """
         size = 20
         # TODO: make matcher work with validation
@@ -37,7 +37,6 @@ cdef class Matcher:
 
         vocab (Vocab): The vocabulary object, which must be shared with the
             documents the matcher will operate on.
-        RETURNS (Matcher): The newly constructed object.
         """
         self._extra_predicates = []
         self._patterns = {}
@@ -32,7 +32,6 @@ cdef class PhraseMatcher:
         vocab (Vocab): The shared vocabulary.
         attr (int / str): Token attribute to match on.
         validate (bool): Perform additional validation when patterns are added.
-        RETURNS (PhraseMatcher): The newly constructed object.
 
         DOCS: https://spacy.io/api/phrasematcher#init
         """
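
A small sketch of the shared-vocab requirement spelled out in the `Matcher` and `PhraseMatcher` docstrings above, assuming the v3-style `add(key, patterns)` signature; the pattern names and patterns are illustrative:

```python
import spacy
from spacy.matcher import Matcher, PhraseMatcher

nlp = spacy.blank("en")

# Both matchers must be created with the same vocab as the Docs they will
# operate on, here nlp.vocab.
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
phrase_matcher.add("GREETING", [nlp.make_doc("Good morning")])

doc = nlp("Hello world! Good morning.")
print(matcher(doc), phrase_matcher(doc))
```
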
@@ -86,7 +86,6 @@ class EntityRuler:
         overwrite_ents (bool): If existing entities are present, e.g. entities
             added by the model, overwrite them by matches if necessary.
         ent_id_sep (str): Separator used internally for entity IDs.
-        RETURNS (EntityRuler): The newly constructed object.
 
         DOCS: https://spacy.io/api/entityruler#init
         """
@@ -72,7 +72,6 @@ class Scorer:
 
     def __init__(self, nlp=None, **cfg):
         """Initialize the Scorer.
-        RETURNS (Scorer): The newly created object.
 
         DOCS: https://spacy.io/api/scorer#init
         """
@@ -97,7 +97,6 @@ cdef class StringStore:
         """Create the StringStore.
 
         strings (iterable): A sequence of unicode strings to add to the store.
-        RETURNS (StringStore): The newly constructed object.
         """
         self.mem = Pool()
         self._map = PreshMap()
@@ -50,7 +50,6 @@ cdef class Tokenizer:
             recognised as tokens.
         url_match (callable): A boolean function matching strings to be
             recognised as tokens after considering prefixes and suffixes.
-        RETURNS (Tokenizer): The newly constructed object.
 
         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)
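
A minimal sketch of the `url_match` hook documented in the `Tokenizer` docstring above; the regular expression is deliberately simplistic and only illustrative:

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
url_re = re.compile(r"https?://\S+")

# With no prefix/suffix/infix rules this tokenizer splits on whitespace only;
# url_match lets anything that looks like a URL through as a single token.
tokenizer = Tokenizer(nlp.vocab, url_match=url_re.match)
print([t.text for t in tokenizer("docs at https://spacy.io/api today")])
```
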
@@ -312,6 +312,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
    """Retokenize the document, such that the token at
    `doc[token_index]` is split into tokens with the orth 'orths'
    token_index(int): token index of the token to split.
+
    orths: IDs of the verbatim text content of the tokens to create
    **attributes: Attributes to assign to each of the newly created tokens. By default,
        attributes are inherited from the original token.
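
The `_split` helper above backs the user-facing `Doc.retokenize()` context manager. A short sketch of the corresponding call, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I live in NewYork")

# Split token 3 ("NewYork") into two tokens; heads are given per new subtoken:
# "New" attaches to the second subtoken ("York"), "York" attaches to "in".
with doc.retokenize() as retokenizer:
    retokenizer.split(doc[3], ["New", "York"], heads=[(doc[3], 1), doc[2]])

print([t.text for t in doc])  # ['I', 'live', 'in', 'New', 'York']
```
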
@@ -1,10 +1,12 @@
+from typing import Iterable, Iterator
 import numpy
 import zlib
 import srsly
 from thinc.api import NumpyOps
 
+from .doc import Doc
+from ..vocab import Vocab
 from ..compat import copy_reg
-from ..tokens import Doc
 from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
 
@@ -44,13 +46,18 @@ class DocBin:
     document from the DocBin.
     """
 
-    def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]):
+    def __init__(
+        self,
+        attrs: Iterable[str] = ALL_ATTRS,
+        store_user_data: bool = False,
+        docs=Iterable[Doc],
+    ) -> None:
         """Create a DocBin object to hold serialized annotations.
 
-        attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
-            always serialized, so they're not required. Defaults to None.
+        attrs (Iterable[str]): List of attributes to serialize. 'orth' and
+            'spacy' are always serialized, so they're not required.
         store_user_data (bool): Whether to include the `Doc.user_data`.
-        RETURNS (DocBin): The newly constructed object.
+        docs (Iterable[Doc]): Docs to add.
 
         DOCS: https://spacy.io/api/docbin#init
         """
@@ -68,11 +75,11 @@ class DocBin:
         for doc in docs:
             self.add(doc)
 
-    def __len__(self):
+    def __len__(self) -> int:
         """RETURNS: The number of Doc objects added to the DocBin."""
         return len(self.tokens)
 
-    def add(self, doc):
+    def add(self, doc: Doc) -> None:
         """Add a Doc's annotations to the DocBin for serialization.
 
         doc (Doc): The Doc object to add.
@@ -100,7 +107,7 @@ class DocBin:
         if self.store_user_data:
             self.user_data.append(srsly.msgpack_dumps(doc.user_data))
 
-    def get_docs(self, vocab):
+    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
         """Recover Doc objects from the annotations, using the given vocab.
 
         vocab (Vocab): The shared vocab.
@@ -125,7 +132,7 @@ class DocBin:
                 doc.user_data.update(user_data)
             yield doc
 
-    def merge(self, other):
+    def merge(self, other: "DocBin") -> None:
         """Extend the annotations of this DocBin with the annotations from
         another. Will raise an error if the pre-defined attrs of the two
         DocBins don't match.
@@ -144,7 +151,7 @@ class DocBin:
         if self.store_user_data:
             self.user_data.extend(other.user_data)
 
-    def to_bytes(self):
+    def to_bytes(self) -> bytes:
         """Serialize the DocBin's annotations to a bytestring.
 
         RETURNS (bytes): The serialized DocBin.
@@ -156,7 +163,6 @@ class DocBin:
         lengths = [len(tokens) for tokens in self.tokens]
         tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([])
         spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
 
         msg = {
             "version": self.version,
             "attrs": self.attrs,
@@ -171,7 +177,7 @@ class DocBin:
             msg["user_data"] = self.user_data
         return zlib.compress(srsly.msgpack_dumps(msg))
 
-    def from_bytes(self, bytes_data):
+    def from_bytes(self, bytes_data: bytes) -> "DocBin":
         """Deserialize the DocBin's annotations from a bytestring.
 
         bytes_data (bytes): The data to load from.
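
A minimal round-trip sketch of the `DocBin` methods annotated above; the text is illustrative:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(store_user_data=False)
doc_bin.add(nlp("Give it back! He pleaded."))

data = doc_bin.to_bytes()                  # serialize to a bytestring
restored = DocBin().from_bytes(data)       # deserialize into a new DocBin
docs = list(restored.get_docs(nlp.vocab))  # recover Doc objects
print(len(restored), docs[0].text)
```
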
@@ -173,7 +173,6 @@ cdef class Doc:
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
-        RETURNS (Doc): The newly constructed object.
 
         DOCS: https://spacy.io/api/doc#init
         """
@@ -94,7 +94,6 @@ cdef class Span:
         kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation
             of the span.
-        RETURNS (Span): The newly constructed object.
 
         DOCS: https://spacy.io/api/span#init
         """
@@ -58,7 +58,6 @@ cdef class Vectors:
         data (numpy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
         name (str): A name to identify the vectors table.
-        RETURNS (Vectors): The newly created object.
 
         DOCS: https://spacy.io/api/vectors#init
         """
@@ -74,7 +74,6 @@ cdef class Vocab:
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
         vectors_name (unicode): Optional name to identify the vectors table.
-        RETURNS (Vocab): The newly constructed object.
         """
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
         if lookups in (None, True, False):
@@ -4,6 +4,7 @@ teaser: Pre-defined model architectures included with the core library
 source: spacy/ml/models
 menu:
   - ['Tok2Vec', 'tok2vec']
+  - ['Transformers', 'transformers']
   - ['Parser & NER', 'parser']
   - ['Text Classification', 'textcat']
   - ['Entity Linking', 'entitylinker']
@@ -13,7 +14,7 @@ TODO: intro and how architectures work, link to
 [`registry`](/api/top-level#registry),
 [custom models](/usage/training#custom-models) usage etc.
 
-## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}}
+## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}
 
 ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
 
@@ -21,12 +22,14 @@ TODO: intro and how architectures work, link to
 
 ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
 
+## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
+
+### spacy-transformers.TransformerModel.v1 {#TransformerModel}
+
 ## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"}
 
 ### spacy.TransitionBasedParser.v1 {#TransitionBasedParser}
 
-<!-- TODO: intro -->
-
 > #### Example Config
 >
 > ```ini
@@ -13,25 +13,84 @@ datasets in the [DocBin](/api/docbin) (`.spacy`) format.
 
 Create a `Corpus`. The input data can be a file or a directory of files.
 
+> #### Example
+>
+> ```python
+> from spacy.gold import Corpus
+>
+> corpus = Corpus("./train.spacy", "./dev.spacy")
+> ```
+
 | Name    | Type         | Description                                                       |
-| ----------- | ------------ | ---------------------------------------------------------------- |
+| ------- | ------------ | ---------------------------------------------------------------- |
 | `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files).     |
 | `dev`   | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files).  |
-| `limit` | int          | Maximum number of examples returned.                              |
-| **RETURNS** | `Corpus` | The newly constructed object.                                     |
+| `limit` | int          | Maximum number of examples returned. `0` for no limit (default).  |
 
-<!-- TODO: document remaining methods / decide which to document -->
-
-## Corpus.walk_corpus {#walk_corpus tag="staticmethod"}
-
-## Corpus.make_examples {#make_examples tag="method"}
-
-## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"}
-
-## Corpus.read_docbin {#read_docbin tag="method"}
-
-## Corpus.count_train {#count_train tag="method"}
-
 ## Corpus.train_dataset {#train_dataset tag="method"}
 
+Yield examples from the training data.
+
+> #### Example
+>
+> ```python
+> from spacy.gold import Corpus
+> import spacy
+>
+> corpus = Corpus("./train.spacy", "./dev.spacy")
+> nlp = spacy.blank("en")
+> train_data = corpus.train_dataset(nlp)
+> ```
+
+| Name           | Type       | Description                                                                                                                                |
+| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `nlp`          | `Language` | The current `nlp` object.                                                                                                                  |
+| _keyword-only_ |            |                                                                                                                                            |
+| `shuffle`      | bool       | Whether to shuffle the examples. Defaults to `True`.                                                                                       |
+| `gold_preproc` | bool       | Whether to train on gold-standard sentences and tokens. Defaults to `False`.                                                               |
+| `max_length`   | int        | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. `0` for no limit (default). |
+| **YIELDS**     | `Example`  | The examples.                                                                                                                              |
+
 ## Corpus.dev_dataset {#dev_dataset tag="method"}
 
+Yield examples from the development data.
+
+> #### Example
+>
+> ```python
+> from spacy.gold import Corpus
+> import spacy
+>
+> corpus = Corpus("./train.spacy", "./dev.spacy")
+> nlp = spacy.blank("en")
+> dev_data = corpus.dev_dataset(nlp)
+> ```
+
+| Name           | Type       | Description                                                                   |
+| -------------- | ---------- | ----------------------------------------------------------------------------- |
+| `nlp`          | `Language` | The current `nlp` object.                                                     |
+| _keyword-only_ |            |                                                                               |
+| `gold_preproc` | bool       | Whether to train on gold-standard sentences and tokens. Defaults to `False`.  |
+| **YIELDS**     | `Example`  | The examples.                                                                 |
+
+## Corpus.count_train {#count_train tag="method"}
+
+Get the word count of all training examples.
+
+> #### Example
+>
+> ```python
+> from spacy.gold import Corpus
+> import spacy
+>
+> corpus = Corpus("./train.spacy", "./dev.spacy")
+> nlp = spacy.blank("en")
+> word_count = corpus.count_train(nlp)
+> ```
+
+| Name        | Type       | Description               |
+| ----------- | ---------- | ------------------------- |
+| `nlp`       | `Language` | The current `nlp` object. |
+| **RETURNS** | int        | The word count.           |
+
+<!-- TODO: document remaining methods? / decide which to document -->
@@ -88,12 +88,11 @@ Create a `Token` object from a `TokenC*` pointer.
 > ```
 
 | Name     | Type      | Description                                                  |
-| ----------- | --------- | ------------------------------------------------------------ |
+| -------- | --------- | ------------------------------------------------------------ |
 | `vocab`  | `Vocab`   | A reference to the shared `Vocab`.                           |
 | `c`      | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. |
 | `offset` | `int`     | The offset of the token within the document.                 |
 | `doc`    | `Doc`     | The parent document.                                         |
-| **RETURNS** | `Token` | The newly constructed object.                                |
 
 ## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"}
@@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 
 ## DependencyParser.begin_training {#begin_training tag="method"}
 
-Initialize the pipe for training, using data examples if available. Return an
+Initialize the pipe for training, using data examples if available. Returns an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
 
 > #### Example
@@ -31,11 +31,10 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > ```
 
 | Name     | Type     | Description                                                                                                                                                           |
-| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`  | `Vocab`  | A storage container for lexical types.                                                                                                                                |
 | `words`  | iterable | A list of strings to add to the container.                                                                                                                            |
 | `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`.  |
-| **RETURNS** | `Doc` | The newly constructed object.                                                                                                                                         |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
@@ -45,10 +45,10 @@ Create a `DocBin` object to hold serialized annotations.
 > ```
 
 | Argument          | Type            | Description                                                                                                                                                                                                                                                                        |
-| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ----------------- | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `attrs`           | list            | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
+| `attrs`           | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
 | `store_user_data` | bool            | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`.                                                                                                                                                                        |
-| **RETURNS**       | `DocBin`        | The newly constructed object.                                                                                                                                                                                                                                                      |
+| `docs`            | `Iterable[Doc]` | `Doc` objects to add on initialization.                                                                                                                                                                                                                                            |
 
 ## DocBin.\_\len\_\_ {#len tag="method"}
@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 
 ## EntityLinker.begin_training {#begin_training tag="method"}
 
-Initialize the pipe for training, using data examples if available. Return an
+Initialize the pipe for training, using data examples if available. Returns an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this
 method, a knowledge base should have been defined with
 [`set_kb`](/api/entitylinker#set_kb).
@@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 
 ## EntityRecognizer.begin_training {#begin_training tag="method"}
 
-Initialize the pipe for training, using data examples if available. Return an
+Initialize the pipe for training, using data examples if available. Returns an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
 
 > #### Example
@@ -37,7 +37,6 @@ both documents.
 | `reference`    | `Doc`       | The document containing gold-standard annotations. Can not be `None`.                            |
 | _keyword-only_ |             |                                                                                                   |
 | `alignment`    | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. |
-| **RETURNS**    | `Example`   | The newly constructed object.                                                                     |
 
 ## Example.from_dict {#from_dict tag="classmethod"}
 
@@ -28,10 +28,9 @@ Create the knowledge base.
 > ```
 
 | Name                   | Type    | Description                              |
-| ---------------------- | --------------- | ---------------------------------------- |
+| ---------------------- | ------- | ---------------------------------------- |
 | `vocab`                | `Vocab` | A `Vocab` object.                        |
 | `entity_vector_length` | int     | Length of the fixed-size entity vectors. |
-| **RETURNS**            | `KnowledgeBase` | The newly constructed object.            |
 
 ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
 
@@ -255,7 +254,6 @@ but instead these objects are returned by the
 | `entity_freq` | float       | The entity frequency as recorded in the KB.                     |
 | `alias_hash`  | int         | The hash of the textual mention or alias.                       |
 | `prior_prob`  | float       | The prior probability of the `alias` referring to the `entity`  |
-| **RETURNS**   | `Candidate` | The newly constructed object.                                   |
 
 ## Candidate attributes {#candidate_attributes}
 
@@ -15,6 +15,58 @@ the tagger or parser that are called on a document in order. You can also add
 your own processing pipeline components that take a `Doc` object, modify it and
 return it.
 
+## Language.\_\_init\_\_ {#init tag="method"}
+
+Initialize a `Language` object.
+
+> #### Example
+>
+> ```python
+> # Construction from subclass
+> from spacy.lang.en import English
+> nlp = English()
+>
+> # Construction from scratch
+> from spacy.vocab import Vocab
+> from spacy.language import Language
+> nlp = Language(Vocab())
+> ```
+
+| Name               | Type        | Description                                                                                  |
+| ------------------ | ----------- | -------------------------------------------------------------------------------------------- |
+| `vocab`            | `Vocab`     | A `Vocab` object. If `True`, a vocab is created using the default language data settings.   |
+| _keyword-only_     |             |                                                                                              |
+| `max_length`       | int         | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`.               |
+| `meta`             | dict        | Custom meta data for the `Language` class. Is written to by models to add model meta data.  |
+| `create_tokenizer` | `Callable`  | Optional function that receives the `nlp` object and returns a tokenizer.                   |
+
+## Language.from_config {#from_config tag="classmethod"}
+
+Create a `Language` object from a loaded config. Will set up the tokenizer and
+language data, add pipeline components based on the pipeline and components
+define in the config and validate the results. If no config is provided, the
+default config of the given language is used. This is also how spaCy loads a
+model under the hood based on its [`config.cfg`](/api/data-formats#config).
+
+> #### Example
+>
+> ```python
+> from thinc.api import Config
+> from spacy.language import Language
+>
+> config = Config().from_disk("./config.cfg")
+> nlp = Language.from_config(config)
+> ```
+
+| Name           | Type                                                                    | Description                                                                                                                               |
+| -------------- | ----------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config`       | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config.                                                                                                                        |
+| _keyword-only_ | |
+| `disable`      | `Iterable[str]`                                                         | List of pipeline component names to disable.                                                                                              |
+| `auto_fill`    | bool                                                                    | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`.  |
+| `validate`     | bool                                                                    | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`.                    |
+| **RETURNS**    | `Language`                                                              | The initialized object.                                                                                                                   |
+
 ## Language.component {#component tag="classmethod" new="3"}
 
 Register a custom pipeline component under a given name. This allows
@@ -101,57 +153,6 @@ examples, see the
 | `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func`                  | `Optional[Callable]` | Optional function if not used a a decorator.                                                                                                                                                                                  |
 
-## Language.\_\_init\_\_ {#init tag="method"}
-
-Initialize a `Language` object.
-
-> #### Example
->
-> ```python
-> from spacy.vocab import Vocab
-> from spacy.language import Language
-> nlp = Language(Vocab())
->
-> from spacy.lang.en import English
-> nlp = English()
-> ```
-
-| Name               | Type        | Description                                                                                  |
-| ------------------ | ----------- | -------------------------------------------------------------------------------------------- |
-| `vocab`            | `Vocab`     | A `Vocab` object. If `True`, a vocab is created using the default language data settings.   |
-| _keyword-only_     |             |                                                                                              |
-| `max_length`       | int         | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`.               |
-| `meta`             | dict        | Custom meta data for the `Language` class. Is written to by models to add model meta data.  |
-| `create_tokenizer` | `Callable`  | Optional function that receives the `nlp` object and returns a tokenizer.                   |
-| **RETURNS**        | `Language`  | The newly constructed object.                                                                |
-
-## Language.from_config {#from_config tag="classmethod"}
-
-Create a `Language` object from a loaded config. Will set up the tokenizer and
-language data, add pipeline components based on the pipeline and components
-define in the config and validate the results. If no config is provided, the
-default config of the given language is used. This is also how spaCy loads a
-model under the hood based on its [`config.cfg`](/api/data-formats#config).
-
-> #### Example
->
-> ```python
-> from thinc.api import Config
-> from spacy.language import Language
->
-> config = Config().from_disk("./config.cfg")
-> nlp = Language.from_config(config)
-> ```
-
-| Name           | Type                                                                    | Description                                                                                                                               |
-| -------------- | ----------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| `config`       | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config.                                                                                                                        |
-| _keyword-only_ | |
-| `disable`      | `Iterable[str]`                                                         | List of pipeline component names to disable.                                                                                              |
-| `auto_fill`    | bool                                                                    | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`.  |
-| `validate`     | bool                                                                    | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`.                    |
-| **RETURNS**    | `Language`                                                              | The initialized object.                                                                                                                   |
-
 ## Language.\_\_call\_\_ {#call tag="method"}
 
 Apply the pipeline to some text. The text can span multiple sentences, and can
@@ -165,10 +166,12 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
 > ```
 
 | Name            | Type              | Description                                                                                              |
-| ----------- | ----------- | --------------------------------------------------------------------------------- |
+| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ |
 | `text`          | str               | The text to be processed.                                                                               |
+| _keyword-only_  |                   |                                                                                                          |
 | `disable`       | `List[str]`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling).                       |
-| **RETURNS** | `Doc`       | A container for accessing the annotations.                                         |
+| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`.  |
+| **RETURNS**     | [`Doc`](/api/doc) | A container for accessing the annotations.                                                              |
 
 ## Language.pipe {#pipe tag="method"}
 
@@ -184,15 +187,57 @@ more efficient than processing texts one-by-one.
 > ```
 
 | Name                                       | Type              | Description                                                                                                                                                  |
-| -------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `texts`                                    | `Iterable[str]`   | A sequence of strings.                                                                                                                                       |
+| _keyword-only_                             |                   |                                                                                                                                                              |
 | `as_tuples`                                | bool              | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`.  |
 | `batch_size`                               | int               | The number of texts to buffer.                                                                                                                               |
 | `disable`                                  | `List[str]`       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling).                                                                           |
-| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name.                                                                              |
+| `cleanup`                                  | bool              | If `True`, unneeded strings are freed to control memory use. Experimental.                                                                                  |
+| `component_cfg`                            | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`.                                                      |
 | `n_process` <Tag variant="new">2.2.2</Tag> | int               | Number of processors to use, only supported in Python 3. Defaults to `1`.                                                                                   |
 | **YIELDS**                                 | `Doc`             | Documents in the order of the original text.                                                                                                                 |
 
+## Language.begin_training {#begin_training tag="method"}
+
+Initialize the pipe for training, using data examples if available. Returns an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+
+> #### Example
+>
+> ```python
+> optimizer = nlp.begin_training(get_examples)
+> ```
+
+| Name           | Type                                                | Description                                                                                                   |
+| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | `Callable[[], Iterable[Example]]`                   | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects.   |
+| _keyword-only_ |                                                     |                                                                                                               |
+| `sgd`          | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set.  |
+| **RETURNS**    | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer.                                                                                                |
+
+## Language.resume_training {#resume_training tag="method,experimental" new="3"}
+
+Continue training a pretrained model. Create and return an optimizer, and
+initialize "rehearsal" for any pipeline component that has a `rehearse` method.
+Rehearsal is used to prevent models from "forgetting" their initialized
+"knowledge". To perform rehearsal, collect samples of text you want the models
+to retain performance on, and call [`nlp.rehearse`](/api/language#rehearse) with
+a batch of [Example](/api/example) objects.
+
+> #### Example
+>
+> ```python
+> optimizer = nlp.resume_training()
+> nlp.rehearse(examples, sgd=optimizer)
+> ```
+
+| Name           | Type                                                | Description                                                                                                   |
+| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| _keyword-only_ |                                                     |                                                                                                               |
+| `sgd`          | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set.  |
+| **RETURNS**    | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer.                                                                                                |
+
 ## Language.update {#update tag="method"}
 
 Update the models in the pipeline.
@@ -207,13 +252,35 @@ Update the models in the pipeline.
 > ```
 
 | Name            | Type                                                | Description                                                                                              |
-| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------- |
+| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
 | `examples`      | `Iterable[Example]`                                 | A batch of `Example` objects to learn from.                                                              |
 | _keyword-only_  |                                                     |                                                                                                          |
 | `drop`          | float                                               | The dropout rate.                                                                                        |
-| `sgd`           | `Optimizer`         | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.                |
+| `sgd`           | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer.                                                                                           |
 | `losses`        | `Dict[str, float]`                                  | Dictionary to update with the loss, keyed by pipeline component.                                         |
-| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
+| `component_cfg` | `Dict[str, dict]`                                   | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`.  |
+| **RETURNS**     | `Dict[str, float]`                                  | The updated `losses` dictionary.                                                                         |
+
+## Language.rehearse {#rehearse tag="method,experimental"}
+
+Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
+current model to make predictions similar to an initial model, to try to address
+the "catastrophic forgetting" problem. This feature is experimental.
+
+> #### Example
+>
+> ```python
+> optimizer = nlp.resume_training()
+> losses = nlp.rehearse(examples, sgd=optimizer)
+> ```
+
+| Name           | Type                                                | Description                                                                                 |
+| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------- |
+| `examples`     | `Iterable[Example]`                                 | A batch of [`Example`](/api/example) objects to learn from.                                |
+| _keyword-only_ |                                                     |                                                                                             |
+| `drop`         | float                                               | The dropout rate.                                                                           |
+| `sgd`          | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer.                                                                              |
+| `losses`       | `Dict[str, float]`                                  | Optional record of the loss during training. Updated using the component name as the key.  |
 | **RETURNS**     | `Dict[str, float]`                                  | The updated `losses` dictionary.                                                                         |
 
 ## Language.evaluate {#evaluate tag="method"}
@ -228,32 +295,14 @@ Evaluate a model's pipeline components.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------------------------------------- | ------------------------------- | ------------------------------------------------------------------------------------- |
|
| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ |
|
||||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||||
|
| _keyword-only_ | | |
|
||||||
| `verbose` | bool | Print debugging information. |
|
| `verbose` | bool | Print debugging information. |
|
||||||
| `batch_size` | int | The batch size to use. |
|
| `batch_size` | int | The batch size to use. |
|
||||||
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
|
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
|
||||||
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
|
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
|
||||||
| **RETURNS** | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores. |
|
| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. |
|
||||||
|
|
||||||
## Language.begin_training {#begin_training tag="method"}

Allocate models, pre-process training data and acquire an
[`Optimizer`](https://thinc.ai/docs/api-optimizers).

> #### Example
>
> ```python
> optimizer = nlp.begin_training(get_examples)
> ```

| Name                                         | Type                | Description                                                                                                        |
| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `get_examples`                               | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects.                              |
| `sgd`                                        | `Optimizer`         | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. If not set, a default one will be created. |
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]`   | Config parameters for specific pipeline components, keyed by component name.                                      |
| `**cfg`                                      | -                   | Config parameters (sent to all components).                                                                        |
| **RETURNS**                                  | `Optimizer`         | An optimizer.                                                                                                      |
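As a rough sketch of how this fits into a simple training loop (assuming the pipeline components have already been added and `train_examples` is a list of `Example` objects):

```python
# Minimal sketch: acquire an optimizer, then update over several epochs.
optimizer = nlp.begin_training()
for epoch in range(10):
    losses = {}
    nlp.update(train_examples, sgd=optimizer, losses=losses)
    print(epoch, losses)
```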
## Language.use_params {#use_params tag="contextmanager, method"}

@@ -296,6 +345,7 @@ To create a component and add it to the pipeline, you should always use

| ------------------------------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| `factory_name`                        | str              | Name of the registered component factory.                                                                                                 |
| `name`                                | str              | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. |
| _keyword-only_                        |                  |                                                                                                                                            |
| `config` <Tag variant="new">3</Tag>   | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory.        |
| `validate` <Tag variant="new">3</Tag> | bool             | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`.                     |
| **RETURNS**                           | callable         | The pipeline component.                                                                                                                    |
@@ -419,9 +469,12 @@ Replace a component in the pipeline.

> ```

| Name                                  | Type             | Description                                                                                                                            |
| ------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `name`                                | str              | Name of the component to replace.                                                                                                      |
| `component`                           | callable         | The pipeline component to insert.                                                                                                      |
| _keyword-only_                        |                  |                                                                                                                                        |
| `config` <Tag variant="new">3</Tag>   | `Dict[str, Any]` | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. |
| `validate` <Tag variant="new">3</Tag> | bool             | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`.                 |
## Language.rename_pipe {#rename_pipe tag="method" new="2"}

@@ -493,7 +546,8 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:

</Infobox>

| Name           | Type            | Description                                                                           |
| -------------- | --------------- | ----------------------------------------------------------------------------------------- |
| _keyword-only_ |                 |                                                                                       |
| `disable`      | str / list      | Name(s) of pipeline components to disable.                                            |
| `enable`       | str / list      | Name(s) of pipeline components that will not be disabled.                             |
| **RETURNS**    | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
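A minimal sketch of how the returned `DisabledPipes` object is typically used (assuming a loaded pipeline `nlp` that has these components):

```python
# Disable everything except the entity recognizer, run some text, then restore.
disabled = nlp.select_pipes(enable="ner")
doc = nlp("Only the ner component runs here.")
disabled.restore()

# The same thing, using the returned object as a context manager:
with nlp.select_pipes(disable=["tagger", "parser"]):
    doc = nlp("Tagger and parser are skipped inside this block.")
```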
@@ -767,8 +821,8 @@ serialization by passing in the string names via the `exclude` argument.

The `FactoryMeta` contains the information about the component and its default
provided by the [`@Language.component`](/api/language#component) or
[`@Language.factory`](/api/language#factory) decorator. It's created whenever a
component is defined and stored on the `Language` class for each component
instance and factory instance.

| Name | Type | Description |
| ----------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------- |
@@ -31,7 +31,6 @@ when a `Language` subclass and its `Vocab` is initialized.

| Name                                   | Type                      | Description                                                                                                               |
| -------------------------------------- | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
| **RETURNS**                            | `Lemmatizer`              | The newly created object.                                                                                                 |

## Lemmatizer.\_\_call\_\_ {#call tag="method"}
@@ -14,10 +14,9 @@ lemmatization depends on the part-of-speech tag).

Create a `Lexeme` object.

| Name        | Type     | Description                   |
| ----------- | -------- | ----------------------------- |
| `vocab`     | `Vocab`  | The parent vocabulary.        |
| `orth`      | int      | The orth id of the lexeme.    |
| **RETURNS** | `Lexeme` | The newly constructed object. |

## Lexeme.set_flag {#set_flag tag="method"}
@@ -237,9 +237,8 @@ Initialize a new table.

> ```

| Name        | Type    | Description                        |
| ----------- | ------- | ---------------------------------- |
| `name`      | str     | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object.      |

### Table.from_dict {#table.from_dict tag="classmethod"}
@@ -20,10 +20,9 @@ string where an integer is expected) or unexpected property names.

> ```

| Name                                    | Type      | Description                                                                                  |
| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------------ |
| `vocab`                                 | `Vocab`   | The vocabulary object, which must be shared with the documents the matcher will operate on. |
| `validate` <Tag variant="new">2.1</Tag> | bool      | Validate all patterns added to this matcher.                                                 |
| **RETURNS**                             | `Matcher` | The newly constructed object.                                                                |

## Matcher.\_\_call\_\_ {#call tag="method"}
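A minimal sketch of constructing a validating `Matcher` and adding a token pattern (assumes a loaded pipeline `nlp`):

```python
from spacy.matcher import Matcher

# Create the matcher with pattern validation and add a two-token pattern.
matcher = Matcher(nlp.vocab, validate=True)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
doc = nlp("Hello world!")
matches = matcher(doc)  # list of (match_id, start, end) tuples
```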
@@ -6,7 +6,6 @@ source: spacy/tokens/morphanalysis.pyx

Stores a single morphological analysis.

## MorphAnalysis.\_\_init\_\_ {#init tag="method"}

Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of

@@ -22,11 +21,9 @@ morphological features.

> ```

| Name        | Type               | Description                   |
| ----------- | ------------------ | ----------------------------- |
| `vocab`     | `Vocab`            | The vocab.                    |
| `features`  | `Union[Dict, str]` | The morphological features.   |
| **RETURNS** | `MorphAnalysis`    | The newly constructed object. |

## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"}

@@ -44,7 +41,6 @@ Whether a feature/value pair is in the analysis.

| ----------- | ----- | ------------------------------------- |
| **RETURNS** | `str` | A feature/value pair in the analysis. |

## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"}

Iterate over the feature/value pairs in the analysis.

@@ -61,7 +57,6 @@ Iterate over the feature/value pairs in the analysis.

| ---------- | ----- | ------------------------------------- |
| **YIELDS** | `str` | A feature/value pair in the analysis. |

## MorphAnalysis.\_\_len\_\_ {#len tag="method"}

Returns the number of features in the analysis.

@@ -78,7 +73,6 @@ Returns the number of features in the analysis.

| ----------- | ----- | ---------------------------------------- |
| **RETURNS** | `int` | The number of features in the analysis. |

## MorphAnalysis.\_\_str\_\_ {#str tag="method"}

Returns the morphological analysis in the UD FEATS string format.

@@ -92,10 +86,9 @@ Returns the morphological analysis in the UD FEATS string format.

> ```

| Name        | Type  | Description                      |
| ----------- | ----- | -------------------------------- |
| **RETURNS** | `str` | The analysis in UD FEATS format. |

## MorphAnalysis.get {#get tag="method"}

Retrieve values for a feature by field.

@@ -109,11 +102,10 @@ Retrieve values for a feature by field.

> ```

| Name        | Type   | Description                        |
| ----------- | ------ | ---------------------------------- |
| `field`     | `str`  | The field to retrieve.             |
| **RETURNS** | `list` | A list of the individual features. |

## MorphAnalysis.to_dict {#to_dict tag="method"}

Produce a dict representation of the analysis, in the same format as the tag

@@ -128,10 +120,9 @@ map.

> ```

| Name        | Type   | Description                              |
| ----------- | ------ | ---------------------------------------- |
| **RETURNS** | `dict` | The dict representation of the analysis. |

## MorphAnalysis.from_id {#from_id tag="classmethod"}

Create a morphological analysis from a given hash ID.

@@ -149,5 +140,3 @@ Create a morphological analysis from a given hash ID.

| ------- | ------- | -------------------------------- |
| `vocab` | `Vocab` | The vocab.                       |
| `key`   | `int`   | The hash of the features string. |
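For orientation, this is roughly how a `MorphAnalysis` is typically obtained and queried. It is only a sketch: it assumes a trained pipeline that sets morphological features is loaded as `nlp`, and the printed values are illustrative.

```python
doc = nlp("She was reading")
morph = doc[2].morph            # the MorphAnalysis of "reading"
print(str(morph))               # e.g. "Aspect=Prog|Tense=Pres|VerbForm=Part"
print(morph.to_dict())          # e.g. {"Aspect": "Prog", "Tense": "Pres", "VerbForm": "Part"}
print(morph.get("Tense"))       # e.g. ["Pres"]
print("Tense=Pres" in morph)    # feature/value membership check
```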
@@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and

## Morphologizer.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example
@@ -4,12 +4,11 @@ tag: class

source: spacy/morphology.pyx
---

Store the possible morphological analyses for a language, and index them by
hash. To save space on each token, tokens only know the hash of their
morphological analysis, so queries of morphological attributes are delegated to
this class.

## Morphology.\_\_init\_\_ {#init tag="method"}

Create a Morphology object using the tag map, lemmatizer and exceptions.

@@ -23,20 +22,17 @@ Create a Morphology object using the tag map, lemmatizer and exceptions.

> ```

| Name         | Type              | Description                                                                                                 |
| ------------ | ----------------- | --------------------------------------------------------------------------------------------------------------- |
| `strings`    | `StringStore`     | The string store.                                                                                           |
| `tag_map`    | `Dict[str, Dict]` | The tag map.                                                                                                |
| `lemmatizer` | `Lemmatizer`      | The lemmatizer.                                                                                             |
| `exc`        | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` |
| **RETURNS**  | `Morphology`      | The newly constructed object.                                                                               |

## Morphology.add {#add tag="method"}

Insert a morphological analysis in the morphology table, if not already present.
The morphological analysis may be provided in the UD FEATS format as a string or
in the tag map dictionary format. Returns the hash of the new analysis.

> #### Example
>

@@ -47,10 +43,9 @@ analysis.

> ```

| Name       | Type               | Description                 |
| ---------- | ------------------ | --------------------------- |
| `features` | `Union[Dict, str]` | The morphological features. |

## Morphology.get {#get tag="method"}

> #### Example
>

@@ -64,32 +59,29 @@ analysis.

Get the FEATS string for the hash of the morphological analysis.

| Name    | Type | Description                             |
| ------- | ---- | --------------------------------------- |
| `morph` | int  | The hash of the morphological analysis. |

## Morphology.load_tag_map {#load_tag_map tag="method"}

Replace the current tag map with the provided tag map.

| Name      | Type              | Description  |
| --------- | ----------------- | ------------ |
| `tag_map` | `Dict[str, Dict]` | The tag map. |

## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"}

Replace the current morphological exceptions with the provided exceptions.

| Name          | Type              | Description                   |
| ------------- | ----------------- | ----------------------------- |
| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. |

## Morphology.add_special_case {#add_special_case tag="method"}

Add a special-case rule to the morphological analyzer. Tokens whose tag and orth
match the rule will receive the specified properties.

> #### Example
>

@@ -99,26 +91,23 @@ orth match the rule will receive the specified properties.

> ```

| Name       | Type | Description                                    |
| ---------- | ---- | ---------------------------------------------- |
| `tag_str`  | str  | The fine-grained tag.                          |
| `orth_str` | str  | The token text.                                |
| `attrs`    | dict | The features to assign for this token and tag. |

## Morphology.exc {#exc tag="property"}

The current morphological exceptions.

| Name       | Type | Description                                         |
| ---------- | ---- | --------------------------------------------------- |
| **YIELDS** | dict | The current dictionary of morphological exceptions. |

## Morphology.lemmatize {#lemmatize tag="method"}

TODO

## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}

Convert a string FEATS representation to a dictionary of features and values in

@@ -133,11 +122,10 @@ the same format as the tag map.

> ```

| Name        | Type | Description                                                         |
| ----------- | ---- | ------------------------------------------------------------------- |
| `feats`     | str  | The morphological features in Universal Dependencies FEATS format. |
| **RETURNS** | dict | The morphological features as a dictionary.                         |

## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"}

Convert a dictionary of features and values to a string FEATS representation.

@@ -155,7 +143,6 @@ Convert a dictionary of features and values to a string FEATS representation.

| `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary.                            |
| **RETURNS**  | str               | The morphological features as in Universal Dependencies FEATS format. |
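A small sketch of the two conversion helpers above; the FEATS string is just an illustration:

```python
from spacy.morphology import Morphology

feats = "Case=Nom|Number=Sing"
feats_dict = Morphology.feats_to_dict(feats)   # {"Case": "Nom", "Number": "Sing"}
assert Morphology.dict_to_feats(feats_dict) == feats
```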
## Attributes {#attributes}

| Name | Type | Description |
@@ -36,11 +36,10 @@ be shown.

> ```

| Name                                    | Type            | Description                                                                                  |
| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------------ |
| `vocab`                                 | `Vocab`         | The vocabulary object, which must be shared with the documents the matcher will operate on. |
| `attr` <Tag variant="new">2.1</Tag>     | int / str       | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text.          |
| `validate` <Tag variant="new">2.1</Tag> | bool            | Validate patterns added to the matcher.                                                      |
| **RETURNS**                             | `PhraseMatcher` | The newly constructed object.                                                                |

## PhraseMatcher.\_\_call\_\_ {#call tag="method"}
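A minimal sketch of matching on the `LOWER` attribute instead of the verbatim text (assumes a loaded pipeline `nlp`):

```python
from spacy.matcher import PhraseMatcher

# Match case-insensitively by comparing the LOWER attribute instead of ORTH.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("LIBRARIES", [nlp.make_doc("spaCy"), nlp.make_doc("Thinc")])
matches = matcher(nlp("We build on spacy and thinc."))
```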
@@ -95,7 +95,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and

## Pipe.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example

@@ -198,7 +198,7 @@ the "catastrophic forgetting" problem. This feature is experimental.

>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = nlp.resume_training()
> losses = pipe.rehearse(examples, sgd=optimizer)
> ```
@@ -29,9 +29,8 @@ Create a new `Scorer`.

> ```

| Name        | Type     | Description                                                                                                                                                                                                                                                             |
| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp`       | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. |
| **RETURNS** | `Scorer` | The newly created object.                                                                                                                                                                                                                                               |

## Scorer.score {#score tag="method"}
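Roughly how the default scorer is used; a sketch that assumes `examples` is a list of annotated `Example` objects:

```python
from spacy.scorer import Scorer

scorer = Scorer()                # builds the default multi-language pipeline
scores = scorer.score(examples)  # dict of evaluation scores
print(scores)
```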
@@ -116,7 +116,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the

## SentenceRecognizer.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example

@@ -201,7 +201,7 @@ the "catastrophic forgetting" problem. This feature is experimental.

>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = nlp.resume_training()
> losses = senter.rehearse(examples, sgd=optimizer)
> ```
@@ -19,14 +19,13 @@ Create a Span object from the slice `doc[start : end]`.

> ```

| Name        | Type                                     | Description                                                                                               |
| ----------- | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
| `doc`       | `Doc`                                    | The parent document.                                                                                      |
| `start`     | int                                      | The index of the first token of the span.                                                                 |
| `end`       | int                                      | The index of the first token after the span.                                                              |
| `label`     | int / str                                | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string.       |
| `kb_id`     | int / str                                | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. |
| `vector`    | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span.                                                                     |
| **RETURNS** | `Span`                                   | The newly constructed object.                                                                              |

## Span.\_\_getitem\_\_ {#getitem tag="method"}
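A minimal sketch of constructing a labelled span directly (assumes a loaded pipeline `nlp`):

```python
from spacy.tokens import Span

doc = nlp.make_doc("Welcome to the Bank of China")
span = Span(doc, 3, 6, label="ORG")   # tokens 3, 4 and 5
assert span.text == "Bank of China"
```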
@@ -20,9 +20,8 @@ Create the `StringStore`.

> ```

| Name        | Type          | Description                                |
| ----------- | ------------- | ------------------------------------------ |
| `strings`   | iterable      | A sequence of strings to add to the store. |
| **RETURNS** | `StringStore` | The newly constructed object.              |

## StringStore.\_\_len\_\_ {#len tag="method"}
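A minimal sketch of the round trip between strings and their hashes:

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore["apple"]        # 64-bit hash value
assert stringstore[apple_hash] == "apple"
```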
@@ -114,7 +114,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and

## Tagger.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example

@@ -199,7 +199,7 @@ the "catastrophic forgetting" problem. This feature is experimental.

>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = nlp.resume_training()
> losses = tagger.rehearse(examples, sgd=optimizer)
> ```
@@ -133,7 +133,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and

## TextCategorizer.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example

@@ -218,7 +218,7 @@ the "catastrophic forgetting" problem. This feature is experimental.

>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = nlp.resume_training()
> losses = textcat.rehearse(examples, sgd=optimizer)
> ```
@@ -110,7 +110,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.

## Tok2Vec.begin_training {#begin_training tag="method"}

Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.

> #### Example
@@ -18,11 +18,10 @@ Construct a `Token` object.

> ```

| Name        | Type    | Description                                 |
| ----------- | ------- | ------------------------------------------- |
| `vocab`     | `Vocab` | A storage container for lexical types.      |
| `doc`       | `Doc`   | The parent document.                        |
| `offset`    | int     | The index of the token within the document. |
| **RETURNS** | `Token` | The newly constructed object.               |

## Token.\_\_len\_\_ {#len tag="method"}

@@ -394,7 +393,7 @@ The L2 norm of the token's vector representation.

## Attributes {#attributes}

| Name                                   | Type   | Description                                     |
| -------------------------------------- | ------ | ------------------------------------------------- |
| `doc`                                  | `Doc`  | The parent document.                            |
| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. |
| `text`                                 | str    | Verbatim text content.                          |
@@ -35,7 +35,7 @@ the

> ```

| Name             | Type     | Description                                                                          |
| ---------------- | -------- | -------------------------------------------------------------------------------------- |
| `vocab`          | `Vocab`  | A storage container for lexical types.                                               |
| `rules`          | dict     | Exceptions and special-cases for the tokenizer.                                      |
| `prefix_search`  | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |

@@ -43,7 +43,6 @@ the

| `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.                                            |
| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches.                                         |
| `url_match`      | callable    | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                                                                   |

## Tokenizer.\_\_call\_\_ {#call tag="method"}
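A minimal sketch of constructing a `Tokenizer` with a special-case rule and a custom prefix pattern (assumes a loaded pipeline `nlp` providing the vocab):

```python
import re
from spacy.tokenizer import Tokenizer

special_cases = {":)": [{"ORTH": ":)"}]}
prefix_re = re.compile(r"^[\[\(]")
tokenizer = Tokenizer(nlp.vocab, rules=special_cases, prefix_search=prefix_re.search)
doc = tokenizer("(hello :)")
```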
website/docs/api/transformer.md (new file, 107 lines)

@@ -0,0 +1,107 @@

---
title: Transformer
teaser: Pipeline component for multi-task learning with transformer models
tag: class
source: github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
new: 3
api_base_class: /api/pipe
api_string_name: transformer
---

> #### Installation
>
> ```bash
> $ pip install spacy-transformers
> ```

<Infobox title="Important note" variant="warning">

This component is available via the extension package
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). It
exposes the component via entry points, so if you have the package installed,
using `factory = "transformer"` in your
[training config](/usage/training#config) or `nlp.add_pipe("transformer")` will
work out-of-the-box.

</Infobox>

This pipeline component lets you use transformer models in your pipeline. The
component assigns the output of the transformer to the Doc's extension
attributes. We also calculate an alignment between the word-piece tokens and the
spaCy tokenization, so that we can use the last hidden states to set the
`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
token, the spaCy token receives the sum of their values. To access the values,
you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. For
more details, see the [usage documentation](/usage/transformers).

## Config and implementation {#config}

The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). See the
[model architectures](/api/architectures) documentation for details on the
architectures and their arguments and hyperparameters.

> #### Example
>
> ```python
> from spacy_transformers import Transformer, DEFAULT_CONFIG
>
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
> ```

| Setting             | Type                                       | Description                     | Default                                                              |
| ------------------- | ------------------------------------------ | ------------------------------- | --------------------------------------------------------------------- |
| `max_batch_items`   | int                                        | Maximum size of a padded batch. | `4096`                                                               |
| `annotation_setter` | Callable                                   | <!-- TODO: -->                  | [`null_annotation_setter`](/api/transformer#null_annotation_setter) |
| `model`             | [`Model`](https://thinc.ai/docs/api-model) | The model to use.               | [TransformerModel](/api/architectures#TransformerModel)             |

```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
```

## Transformer.\_\_init\_\_ {#init tag="method"}

> #### Example
>
> ```python
> # Construction via add_pipe with default model
> trf = nlp.add_pipe("transformer")
>
> # Construction via add_pipe with custom model
> config = {"model": {"@architectures": "my_transformer"}}
> trf = nlp.add_pipe("transformer", config=config)
>
> # Construction from class
> from spacy_transformers import Transformer
> trf = Transformer(nlp.vocab, model)
> ```

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe).

| Name                | Type                                       | Description                                                                                  |
| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------ |
| `vocab`             | `Vocab`                                    | The shared vocabulary.                                                                       |
| `model`             | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.        |
| `annotation_setter` | `Callable`                                 | <!-- TODO: -->                                                                               |
| _keyword-only_      |                                            |                                                                                              |
| `name`              | str                                        | String name of the component instance. Used to add entries to the `losses` during training. |
| `max_batch_items`   | int                                        | Maximum size of a padded batch. Defaults to `128*32`.                                        |

<!-- TODO: document rest -->

## TransformerData {#transformerdata tag="dataclass"}

## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}

## Custom attributes {#custom-attributes}

The component sets the following
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):

| Name           | Type              | Description    |
| -------------- | ----------------- | -------------- |
| `Doc.trf_data` | `TransformerData` | <!-- TODO: --> |
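For orientation, this is roughly how the custom attribute is read after processing a text. It is only a sketch: it assumes `spacy-transformers` is installed and a pipeline containing a `transformer` component is loaded as `nlp`.

```python
doc = nlp("Transformers power this pipeline.")
trf_data = doc._.trf_data             # TransformerData set by the component
last_hidden = trf_data.tensors[-1]    # last hidden states of the wordpieces
print(last_hidden.shape)
```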
@@ -37,7 +37,6 @@ you can add vectors to later.

| `data`      | `ndarray[ndim=1, dtype='float32']` | The vector data.                          |
| `keys`      | iterable                           | A sequence of keys aligned with the data. |
| `name`      | str                                | A name to identify the vectors table.     |
| **RETURNS** | `Vectors`                          | The newly created object.                 |

## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
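A minimal sketch of creating a small vectors table from data and keys:

```python
import numpy
from spacy.vectors import Vectors

data = numpy.zeros((3, 300), dtype="float32")
keys = ["cat", "dog", "rat"]
vectors = Vectors(data=data, keys=keys, name="demo_vectors")
```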
@@ -31,7 +31,6 @@ Create the vocabulary.

| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |
| `oov_prob`                                   | float     | The default OOV probability. Defaults to `-20.0`.                                                                                                           |
| `vectors_name` <Tag variant="new">2.2</Tag>  | str       | A name to identify the vectors table.                                                                                                                       |
| **RETURNS**                                  | `Vocab`   | The newly constructed object.                                                                                                                               |

## Vocab.\_\_len\_\_ {#len tag="method"}
@@ -3,4 +3,154 @@ title: Transformers

teaser: Using transformer models like BERT in spaCy
---

spaCy v3.0 lets you use almost **any statistical model** to power your pipeline.
You can use models implemented in a variety of frameworks, including TensorFlow,
PyTorch and MXNet. To keep things sane, spaCy expects models from these
frameworks to be wrapped with a common interface, using our machine learning
library [Thinc](https://thinc.ai). A transformer model is just a statistical
model, so the
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package
actually has very little work to do: we just have to provide a few functions
that do the required plumbing. We also provide a pipeline component,
[`Transformer`](/api/transformer), that lets you do multi-task learning and lets
you save the transformer outputs for later use.

<Project id="en_core_bert">

Try out a BERT-based model pipeline using this project template: swap in your
data, edit the settings and hyperparameters and train, evaluate, package and
visualize your model.

</Project>

<!-- TODO: the text below has been copied from the spacy-transformers repo and needs to be updated and adjusted

### Training usage

The recommended workflow for training is to use spaCy's
[config system](/usage/training#config), usually via the
[`spacy train`](/api/cli#train) command. The config system lets you describe a
tree of objects by referring to creation functions, including functions you
register yourself. Here's a config snippet for the `Transformer` component,
along with matching Python code.

```ini
[nlp]
lang = "en"
pipeline = ["transformer"]

[components.transformer]
factory = "transformer"
extra_annotation_setter = null
max_batch_size = 32

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "bert-base-cased"
tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "get_doc_spans.v1"
```

```python
from spacy_transformers import Transformer

trf = Transformer(
    nlp.vocab,
    TransformerModel(
        "bert-base-cased",
        get_spans=get_doc_spans,
        tokenizer_config={"use_fast": True},
    ),
    annotation_setter=null_annotation_setter,
    max_batch_size=32,
)
```

The `components.transformer` block adds the `transformer` component to the
pipeline, and the `components.transformer.model` block describes the creation of
a Thinc [`Model`](https://thinc.ai/docs/api-model) object that will be passed
into the component. The block names a function registered in the
`@architectures` registry. This function will be looked up and called using the
provided arguments. You're not limited to just that function --- you can write
your own or use someone else's. The only limitation is that it must return an
object of type `Model[List[Doc], FullTransformerBatch]`: that is, a Thinc model
that takes a list of `Doc` objects, and returns a `FullTransformerBatch` object
with the transformer data.

The same idea applies to task models that power the downstream components. Most
of spaCy's built-in model creation functions support a `tok2vec` argument, which
should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This is
where we'll plug in our transformer model, using the `Tok2VecTransformer` layer,
which sneakily delegates to the `Transformer` pipeline component.

```ini
[nlp]
lang = "en"
pipeline = ["ner"]

[components.ner]
factory = "ner"

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy-transformers.Tok2VecListener.v1"
grad_factor = 1.0

[nlp.pipeline.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
```

The `Tok2VecListener` layer expects a `pooling` layer, which needs to be of type
`Model[Ragged, Floats2d]`. This layer determines how the vector for each spaCy
token will be computed from the zero or more source rows the token is aligned
against. Here we use the `reduce_mean` layer, which averages the wordpiece rows.
We could instead use `reduce_last`, `reduce_max`, or a custom function you write
yourself.

You can have multiple components all listening to the same transformer model,
and all passing gradients back to it. By default, all of the gradients will be
equally weighted. You can control this with the `grad_factor` setting, which
lets you reweight the gradients from the different listeners. For instance,
setting `grad_factor = 0` would disable gradients from one of the listeners,
while `grad_factor = 2.0` would multiply them by 2. This is similar to having a
custom learning rate for each component. Instead of a constant, you can also
provide a schedule, allowing you to freeze the shared parameters at the start of
training.

### Runtime usage

Transformer models can be used as drop-in replacements for other types of neural
networks, so your spaCy pipeline can include them in a way that's completely
invisible to the user. Users will download, load and use the model in the
standard way, like any other spaCy pipeline.

Instead of using the transformers as subnetworks directly, you can also use them
via the [`Transformer`](/api/transformer) pipeline component. This sets the
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
which lets you access the transformers outputs at runtime via the
`doc._.trf_data` extension attribute. You can also customize how the
`Transformer` object sets annotations onto the `Doc`, by customizing the
`Transformer.annotation_setter` object. This callback will be called with the
raw input and output data for the whole batch, along with the batch of `Doc`
objects, allowing you to implement whatever you need.

```python
import spacy

nlp = spacy.load("en_core_trf_lg")
for doc in nlp.pipe(["some text", "some other text"]):
    tokvecs = doc._.trf_data.tensors[-1]
```

The `nlp` object in this example is just like any other spaCy pipeline

-->
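The introduction above mentions that models from other frameworks are wrapped with a common Thinc interface. As a minimal sketch of what that wrapping looks like (the tiny linear layer and zero input are placeholders, not part of this page):

```python
import torch
from thinc.api import PyTorchWrapper

torch_model = torch.nn.Linear(4, 2)     # any torch.nn.Module works here
model = PyTorchWrapper(torch_model)     # now usable as a regular Thinc Model
X = torch.zeros(3, 4).numpy()           # placeholder input batch
Y, backprop = model(X, is_train=True)   # Thinc's Model call signature
```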
@ -32,17 +32,34 @@ with more recent versions of spaCy v2.x, it's **unlikely** that your code relied
on them.

| Removed | Replacement |
| --- | --- |
| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) |
| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) |
| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) |
| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer), |
| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer) |
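
As a small sketch of one row from the table above (`Doc.merge`/`Span.merge` →
[`Doc.retokenize`](/api/doc#retokenize)), using a blank English pipeline so no
trained model is needed:

```python
# Minimal before/after sketch for the Doc.merge -> Doc.retokenize row above.
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is a city")
# v2.x (removed): doc.merge(...) / span.merge()
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])  # merge "New York" into a single token
print([t.text for t in doc])     # ['New York', 'is', 'a', 'city']
```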

## Migrating from v2.x {#migrating}

### Downloading and loading models {#migrating-downloading-models}

Model symlinks and shortcuts like `en` are now officially deprecated. There are
[many different models](/models) with different capabilities and not just one
"English model". In order to download and load a model, you should always use
its full name – for instance, `en_core_web_sm`.

```diff
- python -m spacy download en
+ python -m spacy download en_core_web_sm
```

```diff
- nlp = spacy.load("en")
+ nlp = spacy.load("en_core_web_sm")
```

### Custom pipeline components and factories {#migrating-pipeline-components}

Custom pipeline components now have to be registered explicitly using the

@ -179,6 +196,10 @@ workflows, from data preprocessing to training and packaging your model.

<!-- TODO: write -->

#### Training via the Python API {#migrating-training-python}

<!-- TODO: this should explain the GoldParse -> Example stuff -->
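
As a hedged sketch of the `GoldParse` → `Example` change the TODO refers to,
`GoldParse` objects are replaced by `Example` objects created from a `Doc` and a
dict of annotations (the `spacy.training` import path is an assumption,
matching released v3.x):

```python
# A hedged sketch of the GoldParse -> Example change the TODO refers to.
# In v2.x the rough equivalent was spacy.gold.GoldParse(doc, entities=...).
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("Apple is looking at buying U.K. startup")
annotations = {"entities": [(0, 5, "ORG"), (27, 31, "GPE")]}
example = Example.from_dict(doc, annotations)
```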

#### Packaging models {#migrating-training-packaging}

The [`spacy package`](/api/cli#package) command now automatically builds the

@ -81,6 +81,7 @@
"items": [
  { "text": "Tokenizer", "url": "/api/tokenizer" },
  { "text": "Tok2Vec", "url": "/api/tok2vec" },
  { "text": "Transformer", "url": "/api/transformer" },
  { "text": "Lemmatizer", "url": "/api/lemmatizer" },
  { "text": "Morphologizer", "url": "/api/morphologizer" },
  { "text": "Tagger", "url": "/api/tagger" },

@ -33,11 +33,12 @@ const Link = ({
const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest)
const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest)
const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest)
const sourceWithText = (isSource || isApi) && isString(children)
const withIcon = isApi || isArch || isSource
const sourceWithText = withIcon && isString(children)
const linkClassNames = classNames(classes.root, className, {
  [classes.hidden]: hidden,
  [classes.nowrap]: (isApi || isSource || isArch) && !sourceWithText,
  [classes.nowrap]: (withIcon && !sourceWithText) || isArch,
  [classes.withIcon]: isApi || isSource || isArch,
  [classes.withIcon]: withIcon,
})
const Wrapper = ws ? Whitespace : Fragment
const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 'code' : null

@ -22,6 +22,7 @@ export const headingTextClassName = 'heading-text'
 * @returns {string} - URL to the file on GitHub.
 */
export function github(filepath, branch = 'master') {
  if (filepath && filepath.startsWith('github.com')) return `https://${filepath}`
  const path = filepath ? '/tree/' + (branch || 'master') + '/' + filepath : ''
  return `https://github.com/${repo}${path}`
}