From 5089efa2d0ca304d160337f0c7688e745e27e333 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Feb 2023 14:28:34 +0100 Subject: [PATCH 1/3] Use the same tuple in Span cmp and hash (#12251) --- spacy/tokens/span.pyx | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index aefea4f71..134849cdb 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -134,10 +134,8 @@ cdef class Span: else: return True - cdef SpanC* span_c = self.span_c() - cdef SpanC* other_span_c = other.span_c() - self_tuple = (span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, self.id, self.doc) - other_tuple = (other_span_c.start_char, other_span_c.end_char, other_span_c.label, other_span_c.kb_id, other.id, other.doc) + self_tuple = self._cmp_tuple() + other_tuple = other._cmp_tuple() # < if op == 0: return self_tuple < other_tuple @@ -158,8 +156,20 @@ cdef class Span: return self_tuple >= other_tuple def __hash__(self): + return hash(self._cmp_tuple()) + + def _cmp_tuple(self): cdef SpanC* span_c = self.span_c() - return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, span_c.id)) + return ( + span_c.start_char, + span_c.end_char, + span_c.start, + span_c.end, + span_c.label, + span_c.kb_id, + span_c.id, + self.doc, + ) def __len__(self): """Get the number of tokens in the span. From cf85b81f346a2b64a1f644c5f72603494bdccbdb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Feb 2023 14:37:42 +0100 Subject: [PATCH 2/3] Remove names for vectors (#12243) * Remove names for vectors Named vectors are basically a carry-over from v2 and aren't used for anything. * Format --- spacy/cli/init_pipeline.py | 2 -- spacy/language.py | 10 +--------- spacy/tests/serialize/test_serialize_pipeline.py | 2 +- spacy/tests/vocab_vectors/test_vectors.py | 13 ++++++------- spacy/training/initialize.py | 7 ------- spacy/vectors.pyx | 5 +---- spacy/vocab.pyi | 4 ++-- spacy/vocab.pyx | 14 ++++++-------- website/docs/api/cli.mdx | 3 +-- website/docs/api/vectors.mdx | 1 - website/docs/api/vocab.mdx | 1 - 11 files changed, 18 insertions(+), 44 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index f279cf793..5d5c14957 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -21,7 +21,6 @@ def init_vectors_cli( prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), - name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), # fmt: on @@ -44,7 +43,6 @@ def init_vectors_cli( vectors_loc, truncate=truncate, prune=prune, - name=name, mode=mode, ) msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") diff --git a/spacy/language.py b/spacy/language.py index fb86689bc..13a3d101a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -174,8 +174,7 @@ class Language: if not isinstance(vocab, Vocab) and vocab is not True: raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) if vocab is True: - vectors_name = meta.get("vectors", {}).get("name") - vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) + vocab = create_vocab(self.lang, self.Defaults) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -229,7 +228,6 @@ class Language: "width": self.vocab.vectors_length, "vectors": len(self.vocab.vectors), "keys": self.vocab.vectors.n_keys, - "name": self.vocab.vectors.name, "mode": self.vocab.vectors.mode, } self._meta["labels"] = dict(self.pipe_labels) @@ -2197,9 +2195,6 @@ class Language: if path.exists(): data = srsly.read_json(path) self.meta.update(data) - # self.meta always overrides meta["vectors"] with the metadata - # from self.vocab.vectors, so set the name directly - self.vocab.vectors.name = data.get("vectors", {}).get("name") def deserialize_vocab(path: Path) -> None: if path.exists(): @@ -2268,9 +2263,6 @@ class Language: def deserialize_meta(b): data = srsly.json_loads(b) self.meta.update(data) - # self.meta always overrides meta["vectors"] with the metadata - # from self.vocab.vectors, so set the name directly - self.vocab.vectors.name = data.get("vectors", {}).get("name") deserializers: Dict[str, Callable[[bytes], Any]] = {} deserializers["config.cfg"] = lambda b: self.config.from_bytes( diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 36129a408..4720bc4da 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -181,7 +181,7 @@ def test_issue4042_bug2(): @pytest.mark.issue(4725) def test_issue4725_1(): """Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() nlp = English(vocab=vocab) config = { "update_with_oracle_cut_size": 111, diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 70835816d..ed1322908 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -84,7 +84,7 @@ def test_issue1539(): @pytest.mark.issue(1807) def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" - vocab = Vocab(vectors_name="test_issue1807") + vocab = Vocab() assert "hello" not in vocab vocab.set_vector("hello", numpy.ones((50,), dtype="f")) assert "hello" in vocab @@ -94,13 +94,12 @@ def test_issue1807(): def test_issue2871(): """Test that vectors recover the correct key for spaCy reserved words.""" words = ["dog", "cat", "SUFFIX"] - vocab = Vocab(vectors_name="test_issue2871") + vocab = Vocab() vocab.vectors.resize(shape=(3, 10)) vector_data = numpy.zeros((3, 10), dtype="f") for word in words: _ = vocab[word] # noqa: F841 vocab.set_vector(word, vector_data[0]) - vocab.vectors.name = "dummy_vectors" assert vocab["dog"].rank == 0 assert vocab["cat"].rank == 1 assert vocab["SUFFIX"].rank == 2 @@ -125,7 +124,7 @@ def test_issue4725_2(): # ensures that this runs correctly and doesn't hang or crash because of the global vectors # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), # or because of issues with pickling the NER (cf test_issue4725_1) - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() data = numpy.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 @@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2): def test_vocab_add_vector(): - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() data = OPS.xp.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 @@ -356,7 +355,7 @@ def test_vocab_add_vector(): def test_vocab_prune_vectors(): - vocab = Vocab(vectors_name="test_vocab_prune_vectors") + vocab = Vocab() _ = vocab["cat"] # noqa: F841 _ = vocab["dog"] # noqa: F841 _ = vocab["kitten"] # noqa: F841 @@ -405,7 +404,7 @@ def test_vectors_serialize(): def test_vector_is_oov(): - vocab = Vocab(vectors_name="test_vocab_is_oov") + vocab = Vocab() data = OPS.xp.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 6304e4a84..408acdbee 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -202,7 +202,6 @@ def convert_vectors( *, truncate: int, prune: int, - name: Optional[str] = None, mode: str = VectorsMode.default, ) -> None: vectors_loc = ensure_path(vectors_loc) @@ -241,12 +240,6 @@ def convert_vectors( strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys ) nlp.vocab.deduplicate_vectors() - if name is None: - # TODO: Is this correct? Does this matter? - nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" - else: - nlp.vocab.vectors.name = name - nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name if prune >= 1 and mode != VectorsMode.floret: nlp.vocab.prune_vectors(prune) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index be0f6db09..bec3ac276 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -52,7 +52,6 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors """ cdef public object strings - cdef public object name cdef readonly object mode cdef public object data cdef public object key2row @@ -64,14 +63,13 @@ cdef class Vectors: cdef readonly unicode bow cdef readonly unicode eow - def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): + def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): """Create a new vector store. strings (StringStore): The string store. shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray or cupy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. - name (str): A name to identify the vectors table. mode (str): Vectors mode: "default" or "floret" (default: "default"). minn (int): The floret char ngram minn (default: 0). maxn (int): The floret char ngram maxn (default: 0). @@ -85,7 +83,6 @@ cdef class Vectors: self.strings = strings if self.strings is None: self.strings = StringStore() - self.name = name if mode not in Mode.values(): raise ValueError( Errors.E202.format( diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index 41964703b..871044fff 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -11,7 +11,8 @@ from .vectors import Vectors from pathlib import Path def create_vocab( - lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ... + lang: Optional[str], + defaults: Any, ) -> Vocab: ... class Vocab: @@ -28,7 +29,6 @@ class Vocab: strings: Optional[Union[List[str], StringStore]] = ..., lookups: Optional[Lookups] = ..., oov_prob: float = ..., - vectors_name: Optional[str] = ..., writing_system: Dict[str, Any] = ..., get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ..., ) -> None: ... diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a87f50ad4..f3c3595ef 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -23,7 +23,7 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang -def create_vocab(lang, defaults, vectors_name=None): +def create_vocab(lang, defaults): # If the spacy-lookups-data package is installed, we pre-populate the lookups # with lexeme data, if available lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} @@ -39,7 +39,6 @@ def create_vocab(lang, defaults, vectors_name=None): lex_attr_getters=lex_attrs, writing_system=defaults.writing_system, get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), - vectors_name=vectors_name, ) @@ -51,8 +50,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, - oov_prob=-20., vectors_name=None, writing_system={}, - get_noun_chunks=None, **deprecated_kwargs): + oov_prob=-20., writing_system={}, get_noun_chunks=None, + **deprecated_kwargs): """Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -61,7 +60,6 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. - vectors_name (str): Optional name to identify the vectors table. get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]): A function that yields base noun phrases used for Doc.noun_chunks. """ @@ -78,7 +76,7 @@ cdef class Vocab: _ = self[string] self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings) - self.vectors = Vectors(strings=self.strings, name=vectors_name) + self.vectors = Vectors(strings=self.strings) self.lookups = lookups self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks @@ -308,7 +306,7 @@ cdef class Vocab: for key, row in self.vectors.key2row.items() } # replace vectors with deduplicated version - self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name) + self.vectors = Vectors(strings=self.strings, data=data) for key, row in key2row.items(): self.vectors.add(key, row=row) @@ -358,7 +356,7 @@ cdef class Vocab: keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) - self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name) + self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row]) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys = ops.to_numpy(syn_keys) remap = {} diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index b1c28c764..868079e8c 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`. ```bash -$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose] +$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose] ``` | Name | Description | @@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | | `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | -| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx index d6033c096..021484a1b 100644 --- a/website/docs/api/vectors.mdx +++ b/website/docs/api/vectors.mdx @@ -52,7 +52,6 @@ modified later. | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | | `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ | | `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | -| `name` | A name to identify the vectors table. ~~str~~ | | `mode` 3.2 | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ | | `minn` 3.2 | The floret char ngram minn (default: `0`). ~~int~~ | | `maxn` 3.2 | The floret char ngram maxn (default: `0`). ~~int~~ | diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 131e4ce0a..3faf1f1a0 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -27,7 +27,6 @@ Create the vocabulary. | `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | -| `vectors_name` | A name to identify the vectors table. ~~str~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | From cbc2ae933ee84c433d92ac3845e9520f761ee3c8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Feb 2023 14:46:07 +0100 Subject: [PATCH 3/3] Remove unused Span.char_span(id=) (#12250) --- spacy/tokens/span.pyi | 1 - spacy/tokens/span.pyx | 3 +-- website/docs/api/span.mdx | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 979e74e7e..549990c5e 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -96,7 +96,6 @@ class Span: label: Union[int, str] = ..., kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., - id: Union[int, str] = ..., alignment_mode: str = ..., span_id: Union[int, str] = ..., ) -> Span: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 134849cdb..4990cb5f7 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -666,7 +666,7 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `span.text[start : end]`. start (int): The index of the first character of the span. @@ -676,7 +676,6 @@ cdef class Span: kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. - id (Union[int, str]): Unused. alignment_mode (str): How character indices are aligned to token boundaries. Options: "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index e62d9c724..7e7042866 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -193,7 +193,6 @@ the character indices don't map to a valid span. | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `id` | Unused. ~~Union[int, str]~~ | | `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | `span_id` 3.5.1 | An identifier to associate with the span. ~~Union[int, str]~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |