From 7d8df6915865b76d6ad44f6c8bbd7e6ddcd1fa06 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 13 Sep 2019 00:26:11 +0900
Subject: [PATCH] Bloom-filter backed Lookup Tables (#4268)

* Improve load_language_data helper

* WIP: Add Lookups implementation

* Start moving lemma data over to JSON

* WIP: move data over for more languages

* Convert more languages

* Fix lemmatizer fixtures in tests

* Finish conversion

* Auto-format JSON files

* Fix test for now

* Make sure tables are stored on instance

* Update docstrings

* Update docstrings and errors

* Update test

* Add Lookups.__len__

* Add serialization methods

* Add Lookups.remove_table

* Use msgpack for serialization to disk

* Fix file exists check

* Try using OrderedDict for everything

* Update .flake8 [ci skip]

* Try fixing serialization

* Update test_lookups.py

* Update test_serialize_vocab_strings.py

* Lookups / Tables now work

This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.

* Add lookups to setup.py

* Actually add lookups pyx

The previous commit added the old py file...

* Lookups work-in-progress

* Move from pyx back to py

* Add string based lookups, fix serialization

* Update tests, language/lemmatizer to work with string lookups

There are some outstanding issues here:

- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues

More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.

* Change lemmatizer lookup method to pass (orth, string)

* Fix token lookup

* Fix French lookup

* Fix lt lemmatizer test

* Fix Dutch lemmatizer

* Fix lemmatizer lookup test

This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.

* Make uk/nl/ru lemmatizer lookup methods consistent

The mentioned lemmatizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that method
is called in `token.pyx` was changed, so the language-specific versions
should be updated to take the same arguments as `lookup` in
`lemmatizer.py` (specifically (orth/id, string)).

Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.

Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.

* Make recently added Greek method compatible

* Remove redundant class/method

Leftovers from a merge not cleaned up adequately.
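To make the table API change concrete, here is a minimal usage sketch based
on the methods this patch adds (Lookups.add_table plus Table.set_string,
get_string and contains_string). It assumes a checkout with this branch
installed; the table name "lemma_lookup" and the data are illustrative only.

    from spacy.lookups import Lookups

    lookups = Lookups()
    # Passing initial data lets the Table size its Bloom filter up front;
    # string keys are hashed with spacy.strings.hash_string on insert.
    table = lookups.add_table("lemma_lookup", {"dogs": "dog", "mice": "mouse"})

    table.set_string("boxen", "box")
    assert table.get_string("dogs") == "dog"

    # A missed lookup short-circuits on the Bloom filter before it ever
    # touches the underlying dict.
    assert not table.contains_string("cats")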
---
 spacy/lang/el/lemmatizer/__init__.py      |  6 +-
 spacy/lang/fr/lemmatizer/__init__.py      |  8 +-
 spacy/lang/nl/lemmatizer/__init__.py      | 17 ++--
 spacy/lang/ru/lemmatizer.py               |  2 +-
 spacy/lang/uk/lemmatizer.py               |  2 +-
 spacy/language.py                         |  1 +
 spacy/lemmatizer.py                       |  6 +-
 spacy/lookups.py                          | 99 +++++++++++++++++++----
 spacy/morphology.pyx                      |  2 +-
 spacy/tests/doc/test_creation.py          |  4 +-
 spacy/tests/lang/lt/test_lemmatizer.py    |  2 +-
 spacy/tests/lang/nl/test_lemmatizer.py    |  4 +-
 spacy/tests/vocab_vectors/test_lookups.py | 20 ++---
 spacy/tokens/token.pyx                    |  4 +-
 spacy/vocab.pyx                           |  2 +-
 15 files changed, 126 insertions(+), 53 deletions(-)

diff --git a/spacy/lang/el/lemmatizer/__init__.py b/spacy/lang/el/lemmatizer/__init__.py
index c0ce5c2ad..bc5c00bd8 100644
--- a/spacy/lang/el/lemmatizer/__init__.py
+++ b/spacy/lang/el/lemmatizer/__init__.py
@@ -46,9 +46,9 @@ class GreekLemmatizer(object):
         )
         return lemmas
 
-    def lookup(self, string):
-        if string in self.lookup_table:
-            return self.lookup_table[string]
+    def lookup(self, orth, string):
+        if orth in self.lookup_table:
+            return self.lookup_table[orth]
         return string
 
 
diff --git a/spacy/lang/fr/lemmatizer/__init__.py b/spacy/lang/fr/lemmatizer/__init__.py
index a0a0d2021..879f2c80c 100644
--- a/spacy/lang/fr/lemmatizer/__init__.py
+++ b/spacy/lang/fr/lemmatizer/__init__.py
@@ -52,7 +52,7 @@ class FrenchLemmatizer(object):
         elif univ_pos in (SCONJ, "SCONJ", "sconj"):
             univ_pos = "sconj"
         else:
-            return [self.lookup(string)]
+            return [self.lookup(None, string)]
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(univ_pos, morphology):
             return list(set([string.lower()]))
@@ -114,9 +114,9 @@ class FrenchLemmatizer(object):
     def punct(self, string, morphology=None):
         return self(string, "punct", morphology)
 
-    def lookup(self, string):
-        if string in self.lookup_table:
-            return self.lookup_table[string][0]
+    def lookup(self, orth, string):
+        if orth is not None and orth in self.lookup_table:
+            return self.lookup_table[orth][0]
         return string
 
 
diff --git a/spacy/lang/nl/lemmatizer/__init__.py b/spacy/lang/nl/lemmatizer/__init__.py
index 1e5d9aa1f..db345c088 100644
--- a/spacy/lang/nl/lemmatizer/__init__.py
+++ b/spacy/lang/nl/lemmatizer/__init__.py
@@ -62,25 +62,25 @@ class DutchLemmatizer(object):
         # are not lemmatized. They are lowercased, however.
             return [string]
         # if string in self.lemma_index.get(univ_pos)
-        lemma_index = self.index.get(univ_pos, {})
+        lemma_index = self.index.get_string(univ_pos, {})
         # string is already lemma
         if string in lemma_index:
             return [string]
-        exceptions = self.exc.get(univ_pos, {})
+        exceptions = self.exc.get_string(univ_pos, {})
         # string is irregular token contained in exceptions index.
         try:
             lemma = exceptions[string]
             return [lemma[0]]
         except KeyError:
             pass
-        # string corresponds to key in lookup table
+        # string corresponds to key in lookup table
         lookup_table = self.lookup_table
-        looked_up_lemma = lookup_table.get(string)
+        looked_up_lemma = lookup_table.get_string(string)
         if looked_up_lemma and looked_up_lemma in lemma_index:
             return [looked_up_lemma]
         forms, is_known = lemmatize(
-            string, lemma_index, exceptions, self.rules.get(univ_pos, [])
+            string, lemma_index, exceptions, self.rules.get_string(univ_pos, [])
         )
 
         # Back-off through remaining return value candidates.
@@ -103,9 +103,12 @@ class DutchLemmatizer(object):
     # Overrides parent method so that a lowercased version of the string is
     # used to search the lookup table. This is necessary because our lookup
     # table consists entirely of lowercase keys.
-    def lookup(self, string):
+    def lookup(self, orth, string):
         string = string.lower()
-        return self.lookup_table.get(string, string)
+        if orth is not None:
+            return self.lookup_table.get(orth, string)
+        else:
+            return self.lookup_table.get_string(string, string)
 
     def noun(self, string, morphology=None):
         return self(string, "noun", morphology)
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 300d61c52..9fc600eb8 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -115,7 +115,7 @@ class RussianLemmatizer(Lemmatizer):
     def pron(self, string, morphology=None):
         return self(string, "pron", morphology)
 
-    def lookup(self, string):
+    def lookup(self, orth, string):
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index ab56c824d..ea2c32ee3 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -112,7 +112,7 @@ class UkrainianLemmatizer(Lemmatizer):
     def pron(self, string, morphology=None):
         return self(string, "pron", morphology)
 
-    def lookup(self, string):
+    def lookup(self, orth, string):
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form
diff --git a/spacy/language.py b/spacy/language.py
index 7292e3bf6..f966a6630 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -32,6 +32,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH
 from .lang.tag_map import TAG_MAP
 from .lang.lex_attrs import LEX_ATTRS, is_stop
 from .errors import Errors, Warnings, deprecation_warning
+from .strings import hash_string
 from . import util
 from . import about
 
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index d14f5292e..cfedd7a9d 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -93,9 +93,9 @@ class Lemmatizer(object):
     def punct(self, string, morphology=None):
         return self(string, "punct", morphology)
 
-    def lookup(self, string):
-        if string in self.lookup_table:
-            return self.lookup_table[string]
+    def lookup(self, orth, string):
+        if orth in self.lookup_table:
+            return self.lookup_table[orth]
         return string
 
 
diff --git a/spacy/lookups.py b/spacy/lookups.py
index a6fa7abff..b3b67ae7b 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,4 +1,4 @@
-# coding: utf8
+# coding: utf-8
 from __future__ import unicode_literals
 
 import srsly
@@ -6,7 +6,12 @@ from collections import OrderedDict
 
 from .errors import Errors
 from .util import SimpleFrozenDict, ensure_path
+from .strings import hash_string
+from . import util
+
+import srsly
+from preshed.bloom import BloomFilter
 
 
 class Lookups(object):
     """Container for large lookup tables and dictionaries, e.g. lemmatization
@@ -14,10 +19,6 @@ class Lookups(object):
     so they can be accessed before the pipeline components are applied (e.g.
     in the tokenizer and lemmatizer), as well as within the pipeline
     components via doc.vocab.lookups.
-
-    Important note: At the moment, this class only performs a very basic
-    dictionary lookup. We're planning to replace this with a more efficient
-    implementation. See #3971 for details.
     """
 
     def __init__(self):
@@ -54,8 +55,7 @@ class Lookups(object):
         """
         if name in self.tables:
             raise ValueError(Errors.E158.format(name=name))
-        table = Table(name=name)
-        table.update(data)
+        table = Table(name=name, data=data)
         self._tables[name] = table
         return table
@@ -100,10 +100,9 @@ class Lookups(object):
         bytes_data (bytes): The data to load.
         RETURNS (Lookups): The loaded Lookups.
""" - self._tables = OrderedDict() - msg = srsly.msgpack_loads(bytes_data) - for key, value in msg.items(): - self._tables[key] = Table.from_dict(value) + for key, value in srsly.msgpack_loads(bytes_data).items(): + self._tables[key] = Table(key) + self._tables[key].update_raw(value) return self def to_disk(self, path, **kwargs): @@ -137,8 +136,10 @@ class Lookups(object): class Table(OrderedDict): - """A table in the lookups. Subclass of OrderedDict that implements a - slightly more consistent and unified API. + """A table in the lookups. Subclass of builtin dict that implements a + slightly more consistent and unified API. + + Includes a Bloom filter to speed up missed lookups. """ @classmethod @@ -153,15 +154,81 @@ class Table(OrderedDict): self.update(data) return self - def __init__(self, name=None): + def __init__(self, name=None, data=None): """Initialize a new table. name (unicode): Optional table name for reference. + data (dict): Initial data, used to hint Bloom Filter. RETURNS (Table): The newly created object. """ OrderedDict.__init__(self) self.name = name + # assume a default size of 1M items + size = 1E6 + if data and len(data) > 0: + size = len(data) - def set(self, key, value): - """Set new key/value pair. Same as table[key] = value.""" + self.bloom = BloomFilter.from_error_rate(size) + + if data: + self.update(data) + + def set(self, key, value): + """Set new key/value pair, where key is an integer. Same as + table[key] = value. + """ self[key] = value + + def __setitem__(self, key, value): + OrderedDict.__setitem__(self, key, value) + self.bloom.add(key) + + def set_string(self, key, value): + """Set new key/value pair, where key is a string to be hashed. + """ + hkey = hash_string(key) + self.set(hkey, value) + + def update(self, data): + """Add entries in a dict-like to the table, where keys are strings to + be hashed. + """ + for key, val in data.items(): + self.set_string(key, val) + + def update_raw(self, data): + """Add entries in a dict-like to the table, where keys are ints. + """ + for key, val in data.items(): + self.set(key, val) + + def get(self, key, default=None): + return OrderedDict.get(self, key, default) + + def get_string(self, key, default=None): + hkey = hash_string(key) + return OrderedDict.get(self, hkey, default) + + def __contains__(self, key): + # This can give a false positive, so we need to check it after + if key not in self.bloom: + return False + return OrderedDict.__contains__(self, key) + + def contains_string(self, key): + hkey = hash_string(key) + return self.__contains__(hkey) + + def to_bytes(self): + # TODO: serialize bloom too. For now just reconstruct it. 
+        return srsly.msgpack_dumps({'name': self.name, 'dict': dict(self.items())})
+
+    def from_bytes(self, data):
+        loaded = srsly.msgpack_loads(data)
+        self.name = loaded['name']
+        for key, val in loaded['dict'].items():
+            self[key] = val
+            self.bloom.add(key)
+
+        return self
+
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index bf7aaced0..8cc27fb7d 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -273,7 +273,7 @@ cdef class Morphology:
         """
         if token.lemma == 0:
             orth_str = self.strings[token.lex.orth]
-            lemma = self.lemmatizer.lookup(orth_str)
+            lemma = self.lemmatizer.lookup(token.lex.orth, orth_str)
             token.lemma = self.strings.add(lemma)
 
     cdef int assign_tag(self, TokenC* token, tag_str) except -1:
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index ce42b39b9..b222f6bf0 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -5,11 +5,13 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.lemmatizer import Lemmatizer
+from spacy.lookups import Table
 
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(lookup={"dogs": "dog", "boxen": "box", "mice": "mouse"})
+    lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
+    return Lemmatizer(lookup=lookup)
 
 
 @pytest.fixture
diff --git a/spacy/tests/lang/lt/test_lemmatizer.py b/spacy/tests/lang/lt/test_lemmatizer.py
index 9b2969849..5c3ed34f8 100644
--- a/spacy/tests/lang/lt/test_lemmatizer.py
+++ b/spacy/tests/lang/lt/test_lemmatizer.py
@@ -17,4 +17,4 @@ TEST_CASES = [
 
 @pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
 def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
-    assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens]
+    assert lemmas == [lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens]
diff --git a/spacy/tests/lang/nl/test_lemmatizer.py b/spacy/tests/lang/nl/test_lemmatizer.py
index dae9091b7..93dd1e5e3 100644
--- a/spacy/tests/lang/nl/test_lemmatizer.py
+++ b/spacy/tests/lang/nl/test_lemmatizer.py
@@ -133,11 +133,11 @@ def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma):
 # Using the lemma lookup table only
 @pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
 def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma):
-    lemma_pred = nl_lemmatizer.lookup(text)
+    lemma_pred = nl_lemmatizer.lookup(None, text)
     assert lemma_pred in (lemma, text)
 
 
 @pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
 def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma):
-    lemma_pred = nl_lemmatizer.lookup(text)
+    lemma_pred = nl_lemmatizer.lookup(None, text)
     assert lemma_pred in (lemma, text)
diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py
index 16ffe83fc..7cdf8ff68 100644
--- a/spacy/tests/vocab_vectors/test_lookups.py
+++ b/spacy/tests/vocab_vectors/test_lookups.py
@@ -19,9 +19,9 @@ def test_lookups_api():
     table = lookups.get_table(table_name)
     assert table.name == table_name
     assert len(table) == 2
-    assert table.get("hello") == "world"
-    table.set("a", "b")
-    assert table.get("a") == "b"
+    assert table.get_string("hello") == "world"
+    table.set_string("a", "b")
+    assert table.get_string("a") == "b"
     table = lookups.get_table(table_name)
     assert len(table) == 3
     with pytest.raises(KeyError):
@@ -50,10 +50,10 @@ def test_lookups_to_from_bytes():
     assert "table2" in new_lookups
     table1 = new_lookups.get_table("table1")
     assert len(table1) == 2
-    assert table1.get("foo") == "bar"
+    assert table1.get_string("foo") == "bar"
     table2 = new_lookups.get_table("table2")
     assert len(table2) == 3
-    assert table2.get("b") == 2
+    assert table2.get_string("b") == 2
     assert new_lookups.to_bytes() == lookups_bytes
 
 
@@ -72,10 +72,11 @@ def test_lookups_to_from_disk():
     assert "table2" in new_lookups
     table1 = new_lookups.get_table("table1")
     assert len(table1) == 2
-    assert table1.get("foo") == "bar"
+    assert table1.get_string("foo") == "bar"
     table2 = new_lookups.get_table("table2")
     assert len(table2) == 3
-    assert table2.get("b") == 2
+    assert table2.get_string("b") == 2
+
 
 # This fails on Python 3.5
@@ -93,10 +94,9 @@ def test_lookups_to_from_bytes_via_vocab():
     assert table_name in new_vocab.lookups
     table = new_vocab.lookups.get_table(table_name)
     assert len(table) == 2
-    assert table.get("hello") == "world"
+    assert table.get_string("hello") == "world"
     assert new_vocab.to_bytes() == vocab_bytes
 
-
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_disk_via_vocab():
@@ -113,4 +113,4 @@ def test_lookups_to_from_disk_via_vocab():
     assert table_name in new_vocab.lookups
     table = new_vocab.lookups.get_table(table_name)
     assert len(table) == 2
-    assert table.get("hello") == "world"
+    assert table.get_string("hello") == "world"
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index c7a44f5ca..dfe42d2bd 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -335,7 +335,7 @@ cdef class Token:
         """
        def __get__(self):
            if self.c.lemma == 0:
-                lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_)
+                lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_)
                return self.vocab.strings[lemma_]
            else:
                return self.c.lemma
@@ -862,7 +862,7 @@ cdef class Token:
         """
        def __get__(self):
            if self.c.lemma == 0:
-                return self.vocab.morphology.lemmatizer.lookup(self.orth_)
+                return self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_)
            else:
                return self.vocab.strings[self.c.lemma]
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 7e360d409..021da02fc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -18,10 +18,10 @@ from .structs cimport SerializedLexemeC
 from .compat import copy_reg, basestring_
 from .errors import Errors
 from .lemmatizer import Lemmatizer
-from .lookups import Lookups
 from .attrs import intify_attrs, NORM
 from .vectors import Vectors
 from ._ml import link_vectors_to_models
+from .lookups import Lookups
 from . import util
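
For anyone trying the branch, a small serialization round-trip sketch,
grounded in the tests above; it likewise assumes this branch is installed,
and the table contents are illustrative.

    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("table1", {"foo": "bar"})

    # Tables are serialized via msgpack with their raw hashed keys; the
    # Bloom filter is not serialized yet (see the TODO in Table.to_bytes)
    # and is instead rebuilt key by key on load.
    lookups_bytes = lookups.to_bytes()
    new_lookups = Lookups().from_bytes(lookups_bytes)

    table1 = new_lookups.get_table("table1")
    assert table1.get_string("foo") == "bar"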