Mirror of https://github.com/explosion/spaCy.git · synced 2025-01-28 02:04:07 +03:00
Remove names for vectors (#12243)
* Remove names for vectors

  Named vectors are basically a carry-over from v2 and aren't used for
  anything.

* Format
This commit is contained in:
parent 5089efa2d0
commit cf85b81f34
@@ -21,7 +21,6 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
-    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
     # fmt: on
@@ -44,7 +43,6 @@ def init_vectors_cli(
         vectors_loc,
         truncate=truncate,
         prune=prune,
-        name=name,
         mode=mode,
     )
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
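
With `--name` gone, the `init vectors` command loses one flag but otherwise behaves the same. A minimal sketch of invoking the updated CLI from Python; the vectors file and output directory are illustrative placeholders, not paths from this commit:

```python
# Invoke the updated CLI; note there is no --name flag anymore.
import subprocess

subprocess.run(
    [
        "python", "-m", "spacy", "init", "vectors",
        "en",               # lang
        "vectors.txt",      # vectors_loc: a word2vec-style text file
        "./vectors_model",  # output_dir
        "--prune", "-1",    # keep all vectors (no pruning)
    ],
    check=True,
)
```
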
@@ -174,8 +174,7 @@ class Language:
         if not isinstance(vocab, Vocab) and vocab is not True:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
-            vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
+            vocab = create_vocab(self.lang, self.Defaults)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -229,7 +228,6 @@ class Language:
             "width": self.vocab.vectors_length,
             "vectors": len(self.vocab.vectors),
             "keys": self.vocab.vectors.n_keys,
-            "name": self.vocab.vectors.name,
             "mode": self.vocab.vectors.mode,
         }
         self._meta["labels"] = dict(self.pipe_labels)
@@ -2197,9 +2195,6 @@ class Language:
             if path.exists():
                 data = srsly.read_json(path)
                 self.meta.update(data)
-                # self.meta always overrides meta["vectors"] with the metadata
-                # from self.vocab.vectors, so set the name directly
-                self.vocab.vectors.name = data.get("vectors", {}).get("name")

         def deserialize_vocab(path: Path) -> None:
             if path.exists():
@@ -2268,9 +2263,6 @@ class Language:
         def deserialize_meta(b):
             data = srsly.json_loads(b)
             self.meta.update(data)
-            # self.meta always overrides meta["vectors"] with the metadata
-            # from self.vocab.vectors, so set the name directly
-            self.vocab.vectors.name = data.get("vectors", {}).get("name")

         deserializers: Dict[str, Callable[[bytes], Any]] = {}
         deserializers["config.cfg"] = lambda b: self.config.from_bytes(
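
After this change, the `"vectors"` block that `Language` writes into a pipeline's meta reports only width, vector count, key count, and mode. A quick sketch of the new shape of that metadata (`spacy.blank` gives an empty vectors table, so the numbers here are zeros):

```python
import spacy

nlp = spacy.blank("en")
# Mirrors the fields Language now writes to meta["vectors"];
# "name" is no longer among them.
print({
    "width": nlp.vocab.vectors_length,
    "vectors": len(nlp.vocab.vectors),
    "keys": nlp.vocab.vectors.n_keys,
    "mode": nlp.vocab.vectors.mode,
})
```
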
@@ -181,7 +181,7 @@ def test_issue4042_bug2():
 @pytest.mark.issue(4725)
 def test_issue4725_1():
     """Ensure the pickling of the NER goes well"""
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     nlp = English(vocab=vocab)
     config = {
         "update_with_oracle_cut_size": 111,
@@ -84,7 +84,7 @@ def test_issue1539():
 @pytest.mark.issue(1807)
 def test_issue1807():
     """Test vocab.set_vector also adds the word to the vocab."""
-    vocab = Vocab(vectors_name="test_issue1807")
+    vocab = Vocab()
     assert "hello" not in vocab
     vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
     assert "hello" in vocab
@@ -94,13 +94,12 @@ def test_issue1807():
 def test_issue2871():
     """Test that vectors recover the correct key for spaCy reserved words."""
     words = ["dog", "cat", "SUFFIX"]
-    vocab = Vocab(vectors_name="test_issue2871")
+    vocab = Vocab()
     vocab.vectors.resize(shape=(3, 10))
     vector_data = numpy.zeros((3, 10), dtype="f")
     for word in words:
         _ = vocab[word]  # noqa: F841
         vocab.set_vector(word, vector_data[0])
-    vocab.vectors.name = "dummy_vectors"
     assert vocab["dog"].rank == 0
     assert vocab["cat"].rank == 1
     assert vocab["SUFFIX"].rank == 2
@@ -125,7 +124,7 @@ def test_issue4725_2():
     # ensures that this runs correctly and doesn't hang or crash because of the global vectors
     # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
     # or because of issues with pickling the NER (cf test_issue4725_1)
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     data = numpy.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
@@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):


 def test_vocab_add_vector():
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
@@ -356,7 +355,7 @@ def test_vocab_add_vector():


 def test_vocab_prune_vectors():
-    vocab = Vocab(vectors_name="test_vocab_prune_vectors")
+    vocab = Vocab()
     _ = vocab["cat"]  # noqa: F841
     _ = vocab["dog"]  # noqa: F841
     _ = vocab["kitten"]  # noqa: F841
@@ -405,7 +404,7 @@ def test_vectors_serialize():


 def test_vector_is_oov():
-    vocab = Vocab(vectors_name="test_vocab_is_oov")
+    vocab = Vocab()
     data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
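
The updated tests all construct a bare `Vocab()`. Outside the test suite, the `test_issue1807` pattern now looks like this (a minimal sketch):

```python
import numpy
from spacy.vocab import Vocab

vocab = Vocab()  # vectors_name= is no longer a valid keyword
vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
assert "hello" in vocab                           # set_vector added the word
assert vocab.get_vector("hello").shape == (50,)   # and stored its vector
```
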
@@ -202,7 +202,6 @@ def convert_vectors(
     *,
     truncate: int,
     prune: int,
-    name: Optional[str] = None,
     mode: str = VectorsMode.default,
 ) -> None:
     vectors_loc = ensure_path(vectors_loc)
@@ -241,12 +240,6 @@ def convert_vectors(
             strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
         )
         nlp.vocab.deduplicate_vectors()
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1 and mode != VectorsMode.floret:
         nlp.vocab.prune_vectors(prune)

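
Callers of `convert_vectors` drop the `name` argument accordingly. A hedged usage sketch, assuming the function lives in `spacy.training.initialize` and that `vectors.txt` is a word2vec-style text file (both are illustrative assumptions, not part of this commit):

```python
from pathlib import Path

import spacy
from spacy.training.initialize import convert_vectors

nlp = spacy.blank("en")
# Post-change signature: truncate/prune/mode only, no name= keyword.
convert_vectors(nlp, Path("vectors.txt"), truncate=0, prune=-1, mode="default")
print(f"loaded {len(nlp.vocab.vectors)} vectors")
```
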
@@ -52,7 +52,6 @@ cdef class Vectors:
     DOCS: https://spacy.io/api/vectors
     """
     cdef public object strings
-    cdef public object name
     cdef readonly object mode
     cdef public object data
     cdef public object key2row
@@ -64,14 +63,13 @@ cdef class Vectors:
     cdef readonly unicode bow
     cdef readonly unicode eow

-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
         """Create a new vector store.

         strings (StringStore): The string store.
         shape (tuple): Size of the table, as (# entries, # columns)
         data (numpy.ndarray or cupy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
-        name (str): A name to identify the vectors table.
         mode (str): Vectors mode: "default" or "floret" (default: "default").
         minn (int): The floret char ngram minn (default: 0).
         maxn (int): The floret char ngram maxn (default: 0).
@@ -85,7 +83,6 @@ cdef class Vectors:
         self.strings = strings
         if self.strings is None:
             self.strings = StringStore()
-        self.name = name
         if mode not in Mode.values():
             raise ValueError(
                 Errors.E202.format(
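
The trimmed constructor can be exercised directly. A minimal sketch building a small table; the data values and keys are arbitrary examples:

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

data = numpy.asarray([[1.0, 2.0], [3.0, 4.0]], dtype="f")
vectors = Vectors(strings=StringStore(), data=data, keys=["dog", "cat"])
# Vectors(..., name="en.vectors") would now raise a TypeError:
# the keyword was removed along with the `name` attribute.
assert len(vectors) == 2
```
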
@@ -11,7 +11,8 @@ from .vectors import Vectors
 from pathlib import Path

 def create_vocab(
-    lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
+    lang: Optional[str],
+    defaults: Any,
 ) -> Vocab: ...

 class Vocab:
@@ -28,7 +29,6 @@ class Vocab:
         strings: Optional[Union[List[str], StringStore]] = ...,
         lookups: Optional[Lookups] = ...,
         oov_prob: float = ...,
-        vectors_name: Optional[str] = ...,
         writing_system: Dict[str, Any] = ...,
         get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
     ) -> None: ...
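
The narrowed stub matches the runtime helper changed below. A sketch of calling it with a language's defaults (assuming `create_vocab` remains importable from `spacy.vocab`):

```python
from spacy.lang.en import English
from spacy.vocab import create_vocab

# Only lang and defaults now; vectors_name is gone from the signature.
vocab = create_vocab("en", English.Defaults)
print(vocab.lang)  # "en"
```
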
@@ -23,7 +23,7 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None):
+def create_vocab(lang, defaults):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
@@ -39,7 +39,6 @@ def create_vocab(lang, defaults, vectors_name=None):
         lex_attr_getters=lex_attrs,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
-        vectors_name=vectors_name,
     )


@@ -51,8 +50,8 @@ cdef class Vocab:
     DOCS: https://spacy.io/api/vocab
     """
     def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
-                 oov_prob=-20., vectors_name=None, writing_system={},
-                 get_noun_chunks=None, **deprecated_kwargs):
+                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
+                 **deprecated_kwargs):
         """Create the vocabulary.

         lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -61,7 +60,6 @@ cdef class Vocab:
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
-        vectors_name (str): Optional name to identify the vectors table.
         get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]):
             A function that yields base noun phrases used for Doc.noun_chunks.
         """
@@ -78,7 +76,7 @@ cdef class Vocab:
             _ = self[string]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings)
-        self.vectors = Vectors(strings=self.strings, name=vectors_name)
+        self.vectors = Vectors(strings=self.strings)
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
@@ -308,7 +306,7 @@ cdef class Vocab:
             for key, row in self.vectors.key2row.items()
         }
         # replace vectors with deduplicated version
-        self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name)
+        self.vectors = Vectors(strings=self.strings, data=data)
         for key, row in key2row.items():
             self.vectors.add(key, row=row)

@@ -358,7 +356,7 @@ cdef class Vocab:
         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
         keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
-        self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
+        self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
         syn_keys = ops.to_numpy(syn_keys)
         remap = {}
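
Since `deduplicate_vectors` and `prune_vectors` now rebuild the table without carrying over a name, their observable behavior is unchanged. A hedged sketch of pruning, modeled on `test_vocab_prune_vectors`; the vector values and the exact remap output are illustrative:

```python
import numpy
from spacy.vocab import Vocab

vocab = Vocab()
data = numpy.asarray(
    [[1.0, 1.2, 1.1], [0.3, 1.3, 1.0], [0.9, 1.22, 1.05]], dtype="f"
)
for word, row in zip(["cat", "dog", "kitten"], data):
    _ = vocab[word]           # make sure the lexeme exists
    vocab.set_vector(word, row)
# Keep 2 rows; each pruned word is remapped to its nearest surviving
# neighbour with a similarity score.
remap = vocab.prune_vectors(2)
print(remap)  # e.g. {"kitten": ("cat", <score>)}
```
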
@@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`.
 </Infobox>

 ```bash
-$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
 ```

 | Name | Description |
@@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
 | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
 | `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
-| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
 | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
@@ -52,7 +52,6 @@ modified later.
 | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
 | `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
 | `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
-| `name` | A name to identify the vectors table. ~~str~~ |
 | `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
 | `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ |
 | `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ |
@@ -27,7 +27,6 @@ Create the vocabulary.
 | `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
 | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
 | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
-| `vectors_name` | A name to identify the vectors table. ~~str~~ |
 | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
 | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |