Merge branch 'v4' into feature/radicli

This commit is contained in:
Ines Montani 2023-02-08 16:18:48 +01:00
commit 6300184890
14 changed files with 34 additions and 54 deletions

View File

@ -22,7 +22,6 @@ from ._util import import_code, setup_gpu, _handle_renamed_language_codes
prune=Arg("--prune", "-p", help="Optional number of vectors to prune to"), prune=Arg("--prune", "-p", help="Optional number of vectors to prune to"),
truncate=Arg("--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate=Arg("--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
mode=Arg("--mode", "-m", help="Vectors mode: default or floret"), mode=Arg("--mode", "-m", help="Vectors mode: default or floret"),
name=Arg("--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"), verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
jsonl_loc=Arg("--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"), jsonl_loc=Arg("--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
# fmt: on # fmt: on
@ -34,7 +33,6 @@ def init_vectors_cli(
prune: int = -1, prune: int = -1,
truncate: int = 0, truncate: int = 0,
mode: Literal["default", "floret"] = "default", mode: Literal["default", "floret"] = "default",
name: Optional[str] = None,
verbose: bool = False, verbose: bool = False,
jsonl_loc: Optional[Path] = None, jsonl_loc: Optional[Path] = None,
): ):
@ -56,7 +54,6 @@ def init_vectors_cli(
vectors_loc, vectors_loc,
truncate=truncate, truncate=truncate,
prune=prune, prune=prune,
name=name,
mode=mode, mode=mode,
) )
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")

View File

@ -174,8 +174,7 @@ class Language:
if not isinstance(vocab, Vocab) and vocab is not True: if not isinstance(vocab, Vocab) and vocab is not True:
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
if vocab is True: if vocab is True:
vectors_name = meta.get("vectors", {}).get("name") vocab = create_vocab(self.lang, self.Defaults)
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
else: else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang): if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@ -229,7 +228,6 @@ class Language:
"width": self.vocab.vectors_length, "width": self.vocab.vectors_length,
"vectors": len(self.vocab.vectors), "vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys, "keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode, "mode": self.vocab.vectors.mode,
} }
self._meta["labels"] = dict(self.pipe_labels) self._meta["labels"] = dict(self.pipe_labels)
@ -2197,9 +2195,6 @@ class Language:
if path.exists(): if path.exists():
data = srsly.read_json(path) data = srsly.read_json(path)
self.meta.update(data) self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
def deserialize_vocab(path: Path) -> None: def deserialize_vocab(path: Path) -> None:
if path.exists(): if path.exists():
@ -2268,9 +2263,6 @@ class Language:
def deserialize_meta(b): def deserialize_meta(b):
data = srsly.json_loads(b) data = srsly.json_loads(b)
self.meta.update(data) self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
deserializers: Dict[str, Callable[[bytes], Any]] = {} deserializers: Dict[str, Callable[[bytes], Any]] = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes( deserializers["config.cfg"] = lambda b: self.config.from_bytes(

View File

@ -181,7 +181,7 @@ def test_issue4042_bug2():
@pytest.mark.issue(4725) @pytest.mark.issue(4725)
def test_issue4725_1(): def test_issue4725_1():
"""Ensure the pickling of the NER goes well""" """Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
nlp = English(vocab=vocab) nlp = English(vocab=vocab)
config = { config = {
"update_with_oracle_cut_size": 111, "update_with_oracle_cut_size": 111,

View File

@ -84,7 +84,7 @@ def test_issue1539():
@pytest.mark.issue(1807) @pytest.mark.issue(1807)
def test_issue1807(): def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab.""" """Test vocab.set_vector also adds the word to the vocab."""
vocab = Vocab(vectors_name="test_issue1807") vocab = Vocab()
assert "hello" not in vocab assert "hello" not in vocab
vocab.set_vector("hello", numpy.ones((50,), dtype="f")) vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
assert "hello" in vocab assert "hello" in vocab
@ -94,13 +94,12 @@ def test_issue1807():
def test_issue2871(): def test_issue2871():
"""Test that vectors recover the correct key for spaCy reserved words.""" """Test that vectors recover the correct key for spaCy reserved words."""
words = ["dog", "cat", "SUFFIX"] words = ["dog", "cat", "SUFFIX"]
vocab = Vocab(vectors_name="test_issue2871") vocab = Vocab()
vocab.vectors.resize(shape=(3, 10)) vocab.vectors.resize(shape=(3, 10))
vector_data = numpy.zeros((3, 10), dtype="f") vector_data = numpy.zeros((3, 10), dtype="f")
for word in words: for word in words:
_ = vocab[word] # noqa: F841 _ = vocab[word] # noqa: F841
vocab.set_vector(word, vector_data[0]) vocab.set_vector(word, vector_data[0])
vocab.vectors.name = "dummy_vectors"
assert vocab["dog"].rank == 0 assert vocab["dog"].rank == 0
assert vocab["cat"].rank == 1 assert vocab["cat"].rank == 1
assert vocab["SUFFIX"].rank == 2 assert vocab["SUFFIX"].rank == 2
@ -125,7 +124,7 @@ def test_issue4725_2():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors # ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
# or because of issues with pickling the NER (cf test_issue4725_1) # or because of issues with pickling the NER (cf test_issue4725_1)
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
data = numpy.ndarray((5, 3), dtype="f") data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0
@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
def test_vocab_add_vector(): def test_vocab_add_vector():
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
data = OPS.xp.ndarray((5, 3), dtype="f") data = OPS.xp.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0
@ -356,7 +355,7 @@ def test_vocab_add_vector():
def test_vocab_prune_vectors(): def test_vocab_prune_vectors():
vocab = Vocab(vectors_name="test_vocab_prune_vectors") vocab = Vocab()
_ = vocab["cat"] # noqa: F841 _ = vocab["cat"] # noqa: F841
_ = vocab["dog"] # noqa: F841 _ = vocab["dog"] # noqa: F841
_ = vocab["kitten"] # noqa: F841 _ = vocab["kitten"] # noqa: F841
@ -405,7 +404,7 @@ def test_vectors_serialize():
def test_vector_is_oov(): def test_vector_is_oov():
vocab = Vocab(vectors_name="test_vocab_is_oov") vocab = Vocab()
data = OPS.xp.ndarray((5, 3), dtype="f") data = OPS.xp.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0

View File

@ -96,7 +96,6 @@ class Span:
label: Union[int, str] = ..., label: Union[int, str] = ...,
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
id: Union[int, str] = ...,
alignment_mode: str = ..., alignment_mode: str = ...,
span_id: Union[int, str] = ..., span_id: Union[int, str] = ...,
) -> Span: ... ) -> Span: ...

View File

@ -134,10 +134,8 @@ cdef class Span:
else: else:
return True return True
cdef SpanC* span_c = self.span_c() self_tuple = self._cmp_tuple()
cdef SpanC* other_span_c = other.span_c() other_tuple = other._cmp_tuple()
self_tuple = (span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, self.id, self.doc)
other_tuple = (other_span_c.start_char, other_span_c.end_char, other_span_c.label, other_span_c.kb_id, other.id, other.doc)
# < # <
if op == 0: if op == 0:
return self_tuple < other_tuple return self_tuple < other_tuple
@ -158,8 +156,20 @@ cdef class Span:
return self_tuple >= other_tuple return self_tuple >= other_tuple
def __hash__(self): def __hash__(self):
return hash(self._cmp_tuple())
def _cmp_tuple(self):
cdef SpanC* span_c = self.span_c() cdef SpanC* span_c = self.span_c()
return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, span_c.id)) return (
span_c.start_char,
span_c.end_char,
span_c.start,
span_c.end,
span_c.label,
span_c.kb_id,
span_c.id,
self.doc,
)
def __len__(self): def __len__(self):
"""Get the number of tokens in the span. """Get the number of tokens in the span.
@ -656,7 +666,7 @@ cdef class Span:
else: else:
return self.doc[root] return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0): def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice `span.text[start : end]`. """Create a `Span` object from the slice `span.text[start : end]`.
start_idx (int): The index of the first character of the span. start_idx (int): The index of the first character of the span.
@ -666,7 +676,6 @@ cdef class Span:
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
id (Union[int, str]): Unused.
alignment_mode (str): How character indices are aligned to token alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely with token boundaries), "contract" (span of all tokens completely

View File

@ -202,7 +202,6 @@ def convert_vectors(
*, *,
truncate: int, truncate: int,
prune: int, prune: int,
name: Optional[str] = None,
mode: str = VectorsMode.default, mode: str = VectorsMode.default,
) -> None: ) -> None:
vectors_loc = ensure_path(vectors_loc) vectors_loc = ensure_path(vectors_loc)
@ -241,12 +240,6 @@ def convert_vectors(
strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
) )
nlp.vocab.deduplicate_vectors() nlp.vocab.deduplicate_vectors()
if name is None:
# TODO: Is this correct? Does this matter?
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
else:
nlp.vocab.vectors.name = name
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
if prune >= 1 and mode != VectorsMode.floret: if prune >= 1 and mode != VectorsMode.floret:
nlp.vocab.prune_vectors(prune) nlp.vocab.prune_vectors(prune)

View File

@ -52,7 +52,6 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors DOCS: https://spacy.io/api/vectors
""" """
cdef public object strings cdef public object strings
cdef public object name
cdef readonly object mode cdef readonly object mode
cdef public object data cdef public object data
cdef public object key2row cdef public object key2row
@ -64,14 +63,13 @@ cdef class Vectors:
cdef readonly unicode bow cdef readonly unicode bow
cdef readonly unicode eow cdef readonly unicode eow
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
"""Create a new vector store. """Create a new vector store.
strings (StringStore): The string store. strings (StringStore): The string store.
shape (tuple): Size of the table, as (# entries, # columns) shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray or cupy.ndarray): The vector data. data (numpy.ndarray or cupy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data. keys (iterable): A sequence of keys, aligned with the data.
name (str): A name to identify the vectors table.
mode (str): Vectors mode: "default" or "floret" (default: "default"). mode (str): Vectors mode: "default" or "floret" (default: "default").
minn (int): The floret char ngram minn (default: 0). minn (int): The floret char ngram minn (default: 0).
maxn (int): The floret char ngram maxn (default: 0). maxn (int): The floret char ngram maxn (default: 0).
@ -85,7 +83,6 @@ cdef class Vectors:
self.strings = strings self.strings = strings
if self.strings is None: if self.strings is None:
self.strings = StringStore() self.strings = StringStore()
self.name = name
if mode not in Mode.values(): if mode not in Mode.values():
raise ValueError( raise ValueError(
Errors.E202.format( Errors.E202.format(

View File

@ -11,7 +11,8 @@ from .vectors import Vectors
from pathlib import Path from pathlib import Path
def create_vocab( def create_vocab(
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ... lang: Optional[str],
defaults: Any,
) -> Vocab: ... ) -> Vocab: ...
class Vocab: class Vocab:
@ -28,7 +29,6 @@ class Vocab:
strings: Optional[Union[List[str], StringStore]] = ..., strings: Optional[Union[List[str], StringStore]] = ...,
lookups: Optional[Lookups] = ..., lookups: Optional[Lookups] = ...,
oov_prob: float = ..., oov_prob: float = ...,
vectors_name: Optional[str] = ...,
writing_system: Dict[str, Any] = ..., writing_system: Dict[str, Any] = ...,
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ..., get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
) -> None: ... ) -> None: ...

View File

@ -23,7 +23,7 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, vectors_name=None): def create_vocab(lang, defaults):
# If the spacy-lookups-data package is installed, we pre-populate the lookups # If the spacy-lookups-data package is installed, we pre-populate the lookups
# with lexeme data, if available # with lexeme data, if available
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
@ -39,7 +39,6 @@ def create_vocab(lang, defaults, vectors_name=None):
lex_attr_getters=lex_attrs, lex_attr_getters=lex_attrs,
writing_system=defaults.writing_system, writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
vectors_name=vectors_name,
) )
@ -51,8 +50,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab DOCS: https://spacy.io/api/vocab
""" """
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
oov_prob=-20., vectors_name=None, writing_system={}, oov_prob=-20., writing_system={}, get_noun_chunks=None,
get_noun_chunks=None, **deprecated_kwargs): **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -61,7 +60,6 @@ cdef class Vocab:
vice versa. vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries. lookups (Lookups): Container for large lookup tables and dictionaries.
oov_prob (float): Default OOV probability. oov_prob (float): Default OOV probability.
vectors_name (str): Optional name to identify the vectors table.
get_noun_chunks (Optional[Callable[[Union[Doc, Span]], Iterator[Tuple[int, int, int]]]]): get_noun_chunks (Optional[Callable[[Union[Doc, Span]], Iterator[Tuple[int, int, int]]]]):
A function that yields base noun phrases used for Doc.noun_chunks. A function that yields base noun phrases used for Doc.noun_chunks.
""" """
@ -78,7 +76,7 @@ cdef class Vocab:
_ = self[string] _ = self[string]
self.lex_attr_getters = lex_attr_getters self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings) self.morphology = Morphology(self.strings)
self.vectors = Vectors(strings=self.strings, name=vectors_name) self.vectors = Vectors(strings=self.strings)
self.lookups = lookups self.lookups = lookups
self.writing_system = writing_system self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks self.get_noun_chunks = get_noun_chunks
@ -308,7 +306,7 @@ cdef class Vocab:
for key, row in self.vectors.key2row.items() for key, row in self.vectors.key2row.items()
} }
# replace vectors with deduplicated version # replace vectors with deduplicated version
self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name) self.vectors = Vectors(strings=self.strings, data=data)
for key, row in key2row.items(): for key, row in key2row.items():
self.vectors.add(key, row=row) self.vectors.add(key, row=row)
@ -358,7 +356,7 @@ cdef class Vocab:
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name) self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
syn_keys = ops.to_numpy(syn_keys) syn_keys = ops.to_numpy(syn_keys)
remap = {} remap = {}

View File

@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`.
</Infobox> </Infobox>
```bash ```bash
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose] $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
``` ```
| Name | Description | | Name | Description |
@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | | `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |

View File

@ -193,7 +193,6 @@ the character indices don't map to a valid span.
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `id` | Unused. ~~Union[int, str]~~ |
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ | | `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |

View File

@ -52,7 +52,6 @@ modified later.
| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ | | `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | | `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name` | A name to identify the vectors table. ~~str~~ |
| `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ | | `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
| `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ | | `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ |
| `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ | | `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ |

View File

@ -27,7 +27,6 @@ Create the vocabulary.
| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | | `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `vectors_name` | A name to identify the vectors table. ~~str~~ |
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span]], Iterator[Tuple[int, int, int]]]]~~ | | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span]], Iterator[Tuple[int, int, int]]]]~~ |