💫 Adjust Table API and add docs (#4289)

* Adjust Table API and add docs

* Add attributes and update description [ci skip]

* Use strings.get_string_id instead of hash_string

* Fix table method calls

* Make orth arg in Lemmatizer.lookup optional

Fall back to string, which is now handled by Table.__contains__ out-of-the-box

* Fix method name

* Auto-format
Ines Montani 2019-09-15 22:08:13 +02:00 committed by GitHub
parent 88a9d87f6f
commit bab9976d9a
14 changed files with 215 additions and 104 deletions
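An illustrative sketch of the adjusted behavior summarized in the commit messages above (the table name and entries are made up; `Table` and `get_string_id` are the objects touched in this diff):

```python
from spacy.lookups import Table
from spacy.strings import get_string_id

# String keys are hashed with strings.get_string_id (instead of hash_string),
# so a plain string and its hash address the same entry, and
# Table.__contains__ accepts strings out of the box -- which is what lets
# the orth argument of Lemmatizer.lookup become optional.
table = Table(name="lemma_lookup", data={"going": "go"})
assert "going" in table
assert table["going"] == "go"
assert table[get_string_id("going")] == "go"
assert table.get("gone", "gone") == "gone"  # missing keys fall back to the default
```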

View File

@ -46,9 +46,10 @@ class GreekLemmatizer(object):
)
return lemmas
def lookup(self, orth, string):
if orth in self.lookup_table:
return self.lookup_table[orth]
def lookup(self, string, orth=None):
key = orth if orth is not None else string
if key in self.lookup_table:
return self.lookup_table[key]
return string

View File

@ -52,7 +52,7 @@ class FrenchLemmatizer(object):
elif univ_pos in (SCONJ, "SCONJ", "sconj"):
univ_pos = "sconj"
else:
return [self.lookup(None, string)]
return [self.lookup(string)]
# See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()]))
@ -114,7 +114,7 @@ class FrenchLemmatizer(object):
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
def lookup(self, orth, string):
def lookup(self, string, orth=None):
if orth is not None and orth in self.lookup_table:
return self.lookup_table[orth][0]
return string

View File

@ -62,11 +62,11 @@ class DutchLemmatizer(object):
# are not lemmatized. They are lowercased, however.
return [string]
# if string in self.lemma_index.get(univ_pos)
lemma_index = self.index.get_string(univ_pos, {})
lemma_index = self.index.get(univ_pos, {})
# string is already lemma
if string in lemma_index:
return [string]
exceptions = self.exc.get_string(univ_pos, {})
exceptions = self.exc.get(univ_pos, {})
# string is irregular token contained in exceptions index.
try:
lemma = exceptions[string]
@ -75,12 +75,12 @@ class DutchLemmatizer(object):
pass
# string corresponds to key in lookup table
lookup_table = self.lookup_table
looked_up_lemma = lookup_table.get_string(string)
looked_up_lemma = lookup_table.get(string)
if looked_up_lemma and looked_up_lemma in lemma_index:
return [looked_up_lemma]
forms, is_known = lemmatize(
string, lemma_index, exceptions, self.rules.get_string(univ_pos, [])
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
)
# Back-off through remaining return value candidates.
@ -103,12 +103,12 @@ class DutchLemmatizer(object):
# Overrides parent method so that a lowercased version of the string is
# used to search the lookup table. This is necessary because our lookup
# table consists entirely of lowercase keys.
def lookup(self, orth, string):
def lookup(self, string, orth=None):
string = string.lower()
if orth is not None:
return self.lookup_table.get(orth, string)
else:
return self.lookup_table.get_string(string, string)
return self.lookup_table.get(string, string)
def noun(self, string, morphology=None):
return self(string, "noun", morphology)
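A toy illustration of why the Dutch override lowercases the string before the fall-back lookup, assuming a made-up one-entry table (the real table's keys are all lowercase):

```python
from spacy.lookups import Table

lookup_table = Table(name="lemma_lookup", data={"katten": "kat"})
string = "Katten".lower()
assert lookup_table.get(string, string) == "kat"
assert lookup_table.get("Katten", "Katten") == "Katten"  # without lowercasing, no hit
```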

View File

@ -115,7 +115,7 @@ class RussianLemmatizer(Lemmatizer):
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
def lookup(self, orth, string):
def lookup(self, string, orth=None):
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form

View File

@ -112,7 +112,7 @@ class UkrainianLemmatizer(Lemmatizer):
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
def lookup(self, orth, string):
def lookup(self, string, orth=None):
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form

View File

@ -93,9 +93,19 @@ class Lemmatizer(object):
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
def lookup(self, orth, string):
if orth in self.lookup_table:
return self.lookup_table[orth]
def lookup(self, string, orth=None):
"""Look up a lemma in the table, if available. If no lemma is found,
the original string is returned.
string (unicode): The original string.
orth (int): Optional hash of the string to look up. If not set, the
string will be used and hashed.
RETURNS (unicode): The lemma if the string was found, otherwise the
original string.
"""
key = orth if orth is not None else string
if key in self.lookup_table:
return self.lookup_table[key]
return string
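A self-contained sketch that mirrors the fall-back logic above, using the new `Table` directly (the "dogs" → "dog" entry is made-up example data):

```python
from spacy.lookups import Table
from spacy.strings import get_string_id

lookup_table = Table(name="lemma_lookup", data={"dogs": "dog"})

def lookup(string, orth=None):
    # The orth hash is used as the key when given; otherwise the string
    # itself is, and Table hashes string keys on access.
    key = orth if orth is not None else string
    if key in lookup_table:
        return lookup_table[key]
    return string

assert lookup("dogs") == "dog"
assert lookup("dogs", orth=get_string_id("dogs")) == "dog"
assert lookup("missing") == "missing"  # unknown strings are returned unchanged
```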

View File

@ -3,15 +3,19 @@ from __future__ import unicode_literals
import srsly
from collections import OrderedDict
from preshed.bloom import BloomFilter
from .errors import Errors
from .util import SimpleFrozenDict, ensure_path
from .strings import hash_string
from .compat import basestring_
from .strings import get_string_id
from . import util
import srsly
from preshed.bloom import BloomFilter
def ensure_hash(key):
if isinstance(key, basestring_):
return get_string_id(key)
return key
class Lookups(object):
"""Container for large lookup tables and dictionaries, e.g. lemmatization
@ -102,7 +106,7 @@ class Lookups(object):
"""
for key, value in srsly.msgpack_loads(bytes_data).items():
self._tables[key] = Table(key)
self._tables[key].update_raw(value)
self._tables[key].update(value)
return self
def to_disk(self, path, **kwargs):
@ -163,72 +167,85 @@ class Table(OrderedDict):
"""
OrderedDict.__init__(self)
self.name = name
# assume a default size of 1M items
size = 1E6
if data and len(data) > 0:
size = len(data)
# Assume a default size of 1M items
self.default_size = 1e6
size = len(data) if data and len(data) > 0 else self.default_size
self.bloom = BloomFilter.from_error_rate(size)
if data:
self.update(data)
def set(self, key, value):
"""Set new key/value pair, where key is an integer. Same as
table[key] = value.
"""
self[key] = value
def set_string(self, key, value):
"""Set new key/value pair, where key is a string to be hashed.
"""
hkey = hash_string(key)
self.set(hkey, value)
def update(self, data):
"""Add entries in a dict-like to the table, where keys are strings to
be hashed.
"""
for key, val in data.items():
self.set_string(key, val)
def update_raw(self, data):
"""Add entries in a dict-like to the table, where keys are ints.
"""
for key, val in data.items():
self.set(key, val)
def get_string(self, key, default=None):
hkey = hash_string(key)
return OrderedDict.get(self, hkey, default)
def contains_string(self, key):
hkey = hash_string(key)
return self.__contains__(hkey)
def __setitem__(self, key, value):
"""Set new key/value pair. String keys will be hashed.
key (unicode / int): The key to set.
value: The value to set.
"""
key = ensure_hash(key)
OrderedDict.__setitem__(self, key, value)
self.bloom.add(key)
def set(self, key, value):
"""Set new key/value pair. String keys will be hashed.
Same as table[key] = value.
key (unicode / int): The key to set.
value: The value to set.
"""
self[key] = value
def __getitem__(self, key):
"""Get the value for a given key. String keys will be hashed.
key (unicode / int): The key to get.
RETURNS: The value.
"""
key = ensure_hash(key)
return OrderedDict.__getitem__(self, key)
def get(self, key, default=None):
"""Get the value for a given key. String keys will be hashed.
key (unicode / int): The key to get.
default: The default value to return.
RETURNS: The value.
"""
key = ensure_hash(key)
return OrderedDict.get(self, key, default)
def __contains__(self, key):
"""Check whether a key is in the table. String keys will be hashed.
key (unicode / int): The key to check.
RETURNS (bool): Whether the key is in the table.
"""
key = ensure_hash(key)
# This can give a false positive, so we need to check it after
if key not in self.bloom:
return False
return OrderedDict.__contains__(self, key)
def to_bytes(self):
# TODO: serialize bloom too. For now just reconstruct it.
return srsly.msgpack_dumps({'name': self.name, 'dict': dict(self.items())})
def from_bytes(self, data):
loaded = srsly.msgpack_loads(data)
self.name = loaded['name']
for key, val in loaded['dict'].items():
self[key] = val
self.bloom.add(key)
def to_bytes(self):
"""Serialize table to a bytestring.
RETURNS (bytes): The serialized table.
"""
data = [
("name", self.name),
("dict", dict(self.items())),
("bloom", self.bloom.to_bytes()),
]
return srsly.msgpack_dumps(OrderedDict(data))
def from_bytes(self, bytes_data):
"""Load a table from a bytestring.
bytes_data (bytes): The data to load.
RETURNS (Table): The loaded table.
"""
loaded = srsly.msgpack_loads(bytes_data)
data = loaded.get("dict", {})
self.name = loaded["name"]
self.bloom = BloomFilter().from_bytes(loaded["bloom"])
self.clear()
self.update(data)
return self
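A hedged round-trip sketch of the reworked serialization (made-up name and entries; the Bloom filter now travels with the data instead of being rebuilt key by key):

```python
from spacy.lookups import Table, ensure_hash

table = Table(name="demo", data={"foo": "bar"})
table["baz"] = 100  # string key, hashed by __setitem__
table_bytes = table.to_bytes()

new_table = Table().from_bytes(table_bytes)
assert new_table.name == "demo"
assert new_table["foo"] == "bar"
assert new_table[ensure_hash("baz")] == 100  # same entry via the hash
```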

View File

@ -273,7 +273,7 @@ cdef class Morphology:
"""
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(token.lex.orth, orth_str)
lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth)
token.lemma = self.strings.add(lemma)
cdef int assign_tag(self, TokenC* token, tag_str) except -1:

View File

@ -17,6 +17,4 @@ TEST_CASES = [
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
assert lemmas == [
lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens
]
assert lemmas == [lt_lemmatizer.lookup_table.get(token, token) for token in tokens]

View File

@ -133,11 +133,11 @@ def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma):
# Using the lemma lookup table only
@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma):
lemma_pred = nl_lemmatizer.lookup(None, text)
lemma_pred = nl_lemmatizer.lookup(text)
assert lemma_pred in (lemma, text)
@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma):
lemma_pred = nl_lemmatizer.lookup(None, text)
lemma_pred = nl_lemmatizer.lookup(text)
assert lemma_pred in (lemma, text)

View File

@ -2,7 +2,7 @@
from __future__ import unicode_literals
import pytest
from spacy.lookups import Lookups
from spacy.lookups import Lookups, Table, ensure_hash
from spacy.vocab import Vocab
from ..util import make_tempdir
@ -19,9 +19,9 @@ def test_lookups_api():
table = lookups.get_table(table_name)
assert table.name == table_name
assert len(table) == 2
assert table.get_string("hello") == "world"
table.set_string("a", "b")
assert table.get_string("a") == "b"
assert table["hello"] == "world"
table["a"] = "b"
assert table["a"] == "b"
table = lookups.get_table(table_name)
assert len(table) == 3
with pytest.raises(KeyError):
@ -36,6 +36,43 @@ def test_lookups_api():
lookups.get_table(table_name)
def test_table_api():
table = Table(name="table")
assert table.name == "table"
assert len(table) == 0
assert "abc" not in table
data = {"foo": "bar", "hello": "world"}
table = Table(name="table", data=data)
assert len(table) == len(data)
assert "foo" in table
assert ensure_hash("foo") in table
assert table["foo"] == "bar"
assert table[ensure_hash("foo")] == "bar"
assert table.get("foo") == "bar"
assert table.get("abc") is None
table["abc"] = 123
assert table["abc"] == 123
assert table[ensure_hash("abc")] == 123
table.set("def", 456)
assert table["def"] == 456
assert table[ensure_hash("def")] == 456
def test_table_api_to_from_bytes():
data = {"foo": "bar", "hello": "world", "abc": 123}
table = Table(name="table", data=data)
table_bytes = table.to_bytes()
new_table = Table().from_bytes(table_bytes)
assert new_table.name == "table"
assert len(new_table) == 3
assert new_table["foo"] == "bar"
assert new_table[ensure_hash("foo")] == "bar"
new_table2 = Table(data={"def": 456})
new_table2.from_bytes(table_bytes)
assert len(new_table2) == 3
assert "def" not in new_table2
@pytest.mark.skip(reason="This fails on Python 3.5")
def test_lookups_to_from_bytes():
lookups = Lookups()
@ -49,10 +86,10 @@ def test_lookups_to_from_bytes():
assert "table2" in new_lookups
table1 = new_lookups.get_table("table1")
assert len(table1) == 2
assert table1.get_string("foo") == "bar"
assert table1["foo"] == "bar"
table2 = new_lookups.get_table("table2")
assert len(table2) == 3
assert table2.get_string("b") == 2
assert table2["b"] == 2
assert new_lookups.to_bytes() == lookups_bytes
@ -70,10 +107,10 @@ def test_lookups_to_from_disk():
assert "table2" in new_lookups
table1 = new_lookups.get_table("table1")
assert len(table1) == 2
assert table1.get_string("foo") == "bar"
assert table1["foo"] == "bar"
table2 = new_lookups.get_table("table2")
assert len(table2) == 3
assert table2.get_string("b") == 2
assert table2["b"] == 2
@pytest.mark.skip(reason="This fails on Python 3.5")
@ -90,7 +127,7 @@ def test_lookups_to_from_bytes_via_vocab():
assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2
assert table.get_string("hello") == "world"
assert table["hello"] == "world"
assert new_vocab.to_bytes() == vocab_bytes
@ -109,4 +146,4 @@ def test_lookups_to_from_disk_via_vocab():
assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2
assert table.get_string("hello") == "world"
assert table["hello"] == "world"

View File

@ -335,7 +335,7 @@ cdef class Token:
"""
def __get__(self):
if self.c.lemma == 0:
lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_)
lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
return self.vocab.strings[lemma_]
else:
return self.c.lemma
@ -862,7 +862,7 @@ cdef class Token:
"""
def __get__(self):
if self.c.lemma == 0:
return self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_)
return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
else:
return self.vocab.strings[self.c.lemma]

View File

@ -52,8 +52,8 @@ Lemmatize a string.
Look up a lemma in the lookup table, if available. If no lemma is found, the
original string is returned. Languages can provide a
[lookup table](/usage/adding-languages#lemmatizer) via the `lemma_lookup`
variable, set on the individual `Language` class.
[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
the individual `Language` class.
> #### Example
>
@ -63,10 +63,11 @@ variable, set on the individual `Language` class.
> assert lemmatizer.lookup("going") == "go"
> ```
| Name | Type | Description |
| ----------- | ------- | ----------------------------------------------------------------- |
| `string` | unicode | The string to look up. |
| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
| Name | Type | Description |
| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
| `string` | unicode | The string to look up. |
| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
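For callers that already have the hash of the string, such as a token's `orth` attribute, the hash can be passed through instead of re-hashing the text. A hedged sketch (the model name is illustrative, and the `vocab.morphology.lemmatizer` access path is an assumption):

```python
import spacy

nlp = spacy.load("en_core_web_sm")            # illustrative model name
lemmatizer = nlp.vocab.morphology.lemmatizer  # assumed access path
token = nlp("She was going home")[2]
# token.orth is the hash of token.orth_ (the token text), so it can be
# reused as the lookup key instead of hashing the string again.
lemma = lemmatizer.lookup(token.orth_, orth=token.orth)
```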
## Lemmatizer.is_base_form {#is_base_form tag="method"}

View File

@ -7,10 +7,11 @@ new: 2.2
---
This class allows convenient access to large lookup tables and dictionaries,
e.g. lemmatization data or tokenizer exception lists. Lookups are available via
the [`Vocab`](/api/vocab) as `vocab.lookups`, so they can be accessed before the
pipeline components are applied (e.g. in the tokenizer and lemmatizer), as well
as within the pipeline components via `doc.vocab.lookups`.
e.g. lemmatization data or tokenizer exception lists using Bloom filters.
Lookups are available via the [`Vocab`](/api/vocab) as `vocab.lookups`, so they
can be accessed before the pipeline components are applied (e.g. in the
tokenizer and lemmatizer), as well as within the pipeline components via
`doc.vocab.lookups`.
## Lookups.\_\_init\_\_ {#init tag="method"}
@ -215,8 +216,11 @@ the file doesn't exist.
## Table {#table tag="class, ordereddict"}
A table in the lookups. Subclass of `OrderedDict` that implements a slightly
more consistent and unified API. Supports all other methods and attributes of
`OrderedDict` / `dict`, and the customized methods listed here.
more consistent and unified API and includes a Bloom filter to speed up missed
lookups. Supports **all other methods and attributes** of `OrderedDict` /
`dict`, and the customized methods listed here. Methods that get or set keys
accept both integers and strings (which will be hashed before being added to the
table).
### Table.\_\_init\_\_ {#table.init tag="method"}
@ -226,7 +230,10 @@ Initialize a new table.
>
> ```python
> from spacy.lookups import Table
> table = Table(name="some_table")
> data = {"foo": "bar", "baz": 100}
> table = Table(name="some_table", data=data)
> assert "foo" in table
> assert table["foo"] == "bar"
> ```
| Name | Type | Description |
@ -252,9 +259,10 @@ Initialize a new table from a dict.
| `name` | unicode | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. |
### Table.set {#table.set tag="key"}
### Table.set {#table.set tag="method"}
Set a new key / value pair. Same as `table[key] = value`.
Set a new key / value pair. String keys will be hashed. Same as
`table[key] = value`.
> #### Example
>
@ -265,7 +273,46 @@ Set a new key / value pair. Same as `table[key] = value`.
> assert table["foo"] == "bar"
> ```
| Name | Type | Description |
| ------- | ------- | ----------- |
| `key` | unicode | The key. |
| `value` | - | The value. |
| Name | Type | Description |
| ------- | ------------- | ----------- |
| `key` | unicode / int | The key. |
| `value` | - | The value. |
### Table.to_bytes {#table.to_bytes tag="method"}
Serialize the table to a bytestring.
> #### Example
>
> ```python
> table_bytes = table.to_bytes()
> ```
| Name | Type | Description |
| ----------- | ----- | --------------------- |
| **RETURNS** | bytes | The serialized table. |
### Table.from_bytes {#table.from_bytes tag="method"}
Load a table from a bytestring.
> #### Example
>
> ```python
> table_bytes = table.to_bytes()
> table = Table()
> table.from_bytes(table_bytes)
> ```
| Name | Type | Description |
| ------------ | ------- | ----------------- |
| `bytes_data` | bytes | The data to load. |
| **RETURNS** | `Table` | The loaded table. |
### Attributes {#table-attributes}
| Name | Type | Description |
| -------------- | --------------------------- | ----------------------------------------------------- |
| `name` | unicode | Table name. |
| `default_size` | int | Default size of the Bloom filter if no data is provided. |
| `bloom` | `preshed.bloom.BloomFilter` | The Bloom filter. |
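A rough sketch of how the `bloom` attribute comes into play (made-up data): every key added to the table is also added to the filter, so most misses are rejected before the dict is consulted; false positives are possible, which is why `__contains__` still checks the dict afterwards.

```python
from spacy.lookups import Table, ensure_hash

table = Table(name="demo", data={"hello": "world"})
assert ensure_hash("hello") in table.bloom  # added on insertion
assert "hello" in table
assert "missing" not in table  # usually rejected by the Bloom filter alone
```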