mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
Clean up Vocab constructor (#12290)
* Clean up Vocab constructor * Change effective type of `strings` from `Iterable[str]` to `Optional[StringStore]` * Don't automatically add strings to vocab * Change default values to `None` * Remove `**deprecated_kwargs` * Format
This commit is contained in:
parent
520279ff7c
commit
6ae7618418
|
@ -2,7 +2,7 @@ from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overlo
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
class StringStore:
|
class StringStore:
|
||||||
def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
|
def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
|
||||||
@overload
|
@overload
|
||||||
def __getitem__(self, string_or_hash: str) -> int: ...
|
def __getitem__(self, string_or_hash: str) -> int: ...
|
||||||
@overload
|
@overload
|
||||||
|
|
|
@ -9,6 +9,7 @@ from spacy.lang.en import English
|
||||||
from spacy.lang.en.syntax_iterators import noun_chunks
|
from spacy.lang.en.syntax_iterators import noun_chunks
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipeline import TrainablePipe
|
from spacy.pipeline import TrainablePipe
|
||||||
|
from spacy.strings import StringStore
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
|
from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
|
||||||
|
@ -131,7 +132,7 @@ def test_issue5458():
|
||||||
# Test that the noun chuncker does not generate overlapping spans
|
# Test that the noun chuncker does not generate overlapping spans
|
||||||
# fmt: off
|
# fmt: off
|
||||||
words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
|
words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
|
||||||
vocab = Vocab(strings=words)
|
vocab = Vocab(strings=StringStore(words))
|
||||||
deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
|
deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
|
||||||
pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
|
pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
|
||||||
heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
|
heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
|
||||||
|
|
|
@ -13,8 +13,11 @@ from spacy.vocab import Vocab
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
||||||
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
|
test_strings = [
|
||||||
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
|
(StringStore(), StringStore()),
|
||||||
|
(StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])),
|
||||||
|
]
|
||||||
|
test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.issue(599)
|
@pytest.mark.issue(599)
|
||||||
|
@ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
||||||
vocab2 = Vocab(strings=strings2)
|
vocab2 = Vocab(strings=strings2)
|
||||||
vocab1_b = vocab1.to_bytes()
|
vocab1_b = vocab1.to_bytes()
|
||||||
vocab2_b = vocab2.to_bytes()
|
vocab2_b = vocab2.to_bytes()
|
||||||
if strings1 == strings2:
|
if strings1.to_bytes() == strings2.to_bytes():
|
||||||
assert vocab1_b == vocab2_b
|
assert vocab1_b == vocab2_b
|
||||||
else:
|
else:
|
||||||
assert vocab1_b != vocab2_b
|
assert vocab1_b != vocab2_b
|
||||||
|
@ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
|
||||||
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
|
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
|
||||||
vocab1 = Vocab(strings=strings)
|
vocab1 = Vocab(strings=strings)
|
||||||
vocab2 = Vocab()
|
vocab2 = Vocab()
|
||||||
vocab1[strings[0]].norm_ = lex_attr
|
s = next(iter(vocab1.strings))
|
||||||
assert vocab1[strings[0]].norm_ == lex_attr
|
vocab1[s].norm_ = lex_attr
|
||||||
assert vocab2[strings[0]].norm_ != lex_attr
|
assert vocab1[s].norm_ == lex_attr
|
||||||
|
assert vocab2[s].norm_ != lex_attr
|
||||||
vocab2 = vocab2.from_bytes(vocab1.to_bytes())
|
vocab2 = vocab2.from_bytes(vocab1.to_bytes())
|
||||||
assert vocab2[strings[0]].norm_ == lex_attr
|
assert vocab2[s].norm_ == lex_attr
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||||
|
@ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
|
||||||
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
|
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
|
||||||
vocab1 = Vocab(strings=strings)
|
vocab1 = Vocab(strings=strings)
|
||||||
vocab2 = Vocab()
|
vocab2 = Vocab()
|
||||||
vocab1[strings[0]].norm_ = lex_attr
|
s = next(iter(vocab1.strings))
|
||||||
assert vocab1[strings[0]].norm_ == lex_attr
|
vocab1[s].norm_ = lex_attr
|
||||||
assert vocab2[strings[0]].norm_ != lex_attr
|
assert vocab1[s].norm_ == lex_attr
|
||||||
|
assert vocab2[s].norm_ != lex_attr
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
file_path = d / "vocab"
|
file_path = d / "vocab"
|
||||||
vocab1.to_disk(file_path)
|
vocab1.to_disk(file_path)
|
||||||
vocab2 = vocab2.from_disk(file_path)
|
vocab2 = vocab2.from_disk(file_path)
|
||||||
assert vocab2[strings[0]].norm_ == lex_attr
|
assert vocab2[s].norm_ == lex_attr
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||||
|
|
|
@ -17,7 +17,7 @@ def test_issue361(en_vocab, text1, text2):
|
||||||
|
|
||||||
@pytest.mark.issue(600)
|
@pytest.mark.issue(600)
|
||||||
def test_issue600():
|
def test_issue600():
|
||||||
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
|
vocab = Vocab()
|
||||||
doc = Doc(vocab, words=["hello"])
|
doc = Doc(vocab, words=["hello"])
|
||||||
doc[0].tag_ = "NN"
|
doc[0].tag_ = "NN"
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ class Vocab:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
|
lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
|
||||||
strings: Optional[Union[List[str], StringStore]] = ...,
|
strings: Optional[StringStore] = ...,
|
||||||
lookups: Optional[Lookups] = ...,
|
lookups: Optional[Lookups] = ...,
|
||||||
oov_prob: float = ...,
|
oov_prob: float = ...,
|
||||||
writing_system: Dict[str, Any] = ...,
|
writing_system: Dict[str, Any] = ...,
|
||||||
|
|
|
@ -49,9 +49,8 @@ cdef class Vocab:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vocab
|
DOCS: https://spacy.io/api/vocab
|
||||||
"""
|
"""
|
||||||
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
|
def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
|
||||||
oov_prob=-20., writing_system={}, get_noun_chunks=None,
|
oov_prob=-20., writing_system=None, get_noun_chunks=None):
|
||||||
**deprecated_kwargs):
|
|
||||||
"""Create the vocabulary.
|
"""Create the vocabulary.
|
||||||
|
|
||||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||||
|
@ -69,16 +68,19 @@ cdef class Vocab:
|
||||||
self.cfg = {'oov_prob': oov_prob}
|
self.cfg = {'oov_prob': oov_prob}
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._by_orth = PreshMap()
|
self._by_orth = PreshMap()
|
||||||
self.strings = StringStore()
|
|
||||||
self.length = 0
|
self.length = 0
|
||||||
if strings:
|
if strings is None:
|
||||||
for string in strings:
|
self.strings = StringStore()
|
||||||
_ = self[string]
|
else:
|
||||||
|
self.strings = strings
|
||||||
self.lex_attr_getters = lex_attr_getters
|
self.lex_attr_getters = lex_attr_getters
|
||||||
self.morphology = Morphology(self.strings)
|
self.morphology = Morphology(self.strings)
|
||||||
self.vectors = Vectors(strings=self.strings)
|
self.vectors = Vectors(strings=self.strings)
|
||||||
self.lookups = lookups
|
self.lookups = lookups
|
||||||
self.writing_system = writing_system
|
if writing_system is None:
|
||||||
|
self.writing_system = {}
|
||||||
|
else:
|
||||||
|
self.writing_system = writing_system
|
||||||
self.get_noun_chunks = get_noun_chunks
|
self.get_noun_chunks = get_noun_chunks
|
||||||
|
|
||||||
property vectors:
|
property vectors:
|
||||||
|
|
|
@ -17,14 +17,15 @@ Create the vocabulary.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
|
> from spacy.strings import StringStore
|
||||||
> from spacy.vocab import Vocab
|
> from spacy.vocab import Vocab
|
||||||
> vocab = Vocab(strings=["hello", "world"])
|
> vocab = Vocab(strings=StringStore(["hello", "world"]))
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ |
|
| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ |
|
||||||
| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
|
| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~ |
|
||||||
| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
|
| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||||
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
|
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
|
||||||
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
|
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user