diff --git a/spacy/lang/el/lemmatizer/__init__.py b/spacy/lang/el/lemmatizer/__init__.py index bc5c00bd8..994bf9c16 100644 --- a/spacy/lang/el/lemmatizer/__init__.py +++ b/spacy/lang/el/lemmatizer/__init__.py @@ -46,9 +46,10 @@ class GreekLemmatizer(object): ) return lemmas - def lookup(self, orth, string): - if orth in self.lookup_table: - return self.lookup_table[orth] + def lookup(self, string, orth=None): + key = orth if orth is not None else string + if key in self.lookup_table: + return self.lookup_table[key] return string diff --git a/spacy/lang/fr/lemmatizer/__init__.py b/spacy/lang/fr/lemmatizer/__init__.py index 879f2c80c..dfd822188 100644 --- a/spacy/lang/fr/lemmatizer/__init__.py +++ b/spacy/lang/fr/lemmatizer/__init__.py @@ -52,7 +52,7 @@ class FrenchLemmatizer(object): elif univ_pos in (SCONJ, "SCONJ", "sconj"): univ_pos = "sconj" else: - return [self.lookup(None, string)] + return [self.lookup(string)] # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): return list(set([string.lower()])) @@ -114,7 +114,7 @@ class FrenchLemmatizer(object): def punct(self, string, morphology=None): return self(string, "punct", morphology) - def lookup(self, orth, string): + def lookup(self, string, orth=None): if orth is not None and orth in self.lookup_table: return self.lookup_table[orth][0] return string diff --git a/spacy/lang/nl/lemmatizer/__init__.py b/spacy/lang/nl/lemmatizer/__init__.py index db345c088..ee4eaabb3 100644 --- a/spacy/lang/nl/lemmatizer/__init__.py +++ b/spacy/lang/nl/lemmatizer/__init__.py @@ -62,11 +62,11 @@ class DutchLemmatizer(object): # are not lemmatized. They are lowercased, however. return [string] # if string in self.lemma_index.get(univ_pos) - lemma_index = self.index.get_string(univ_pos, {}) + lemma_index = self.index.get(univ_pos, {}) # string is already lemma if string in lemma_index: return [string] - exceptions = self.exc.get_string(univ_pos, {}) + exceptions = self.exc.get(univ_pos, {}) # string is irregular token contained in exceptions index. try: lemma = exceptions[string] @@ -75,12 +75,12 @@ class DutchLemmatizer(object): pass # string corresponds to key in lookup table lookup_table = self.lookup_table - looked_up_lemma = lookup_table.get_string(string) + looked_up_lemma = lookup_table.get(string) if looked_up_lemma and looked_up_lemma in lemma_index: return [looked_up_lemma] forms, is_known = lemmatize( - string, lemma_index, exceptions, self.rules.get_string(univ_pos, []) + string, lemma_index, exceptions, self.rules.get(univ_pos, []) ) # Back-off through remaining return value candidates. @@ -103,12 +103,12 @@ class DutchLemmatizer(object): # Overrides parent method so that a lowercased version of the string is # used to search the lookup table. This is necessary because our lookup # table consists entirely of lowercase keys. 
- def lookup(self, orth, string): + def lookup(self, string, orth=None): string = string.lower() if orth is not None: return self.lookup_table.get(orth, string) else: - return self.lookup_table.get_string(string, string) + return self.lookup_table.get(string, string) def noun(self, string, morphology=None): return self(string, "noun", morphology) diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 9fc600eb8..638565b6c 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -115,7 +115,7 @@ class RussianLemmatizer(Lemmatizer): def pron(self, string, morphology=None): return self(string, "pron", morphology) - def lookup(self, orth, string): + def lookup(self, string, orth=None): analyses = self._morph.parse(string) if len(analyses) == 1: return analyses[0].normal_form diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index ea2c32ee3..cf7591ea8 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -112,7 +112,7 @@ class UkrainianLemmatizer(Lemmatizer): def pron(self, string, morphology=None): return self(string, "pron", morphology) - def lookup(self, orth, string): + def lookup(self, string, orth=None): analyses = self._morph.parse(string) if len(analyses) == 1: return analyses[0].normal_form diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index cfedd7a9d..26c2227a0 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -93,9 +93,19 @@ class Lemmatizer(object): def punct(self, string, morphology=None): return self(string, "punct", morphology) - def lookup(self, orth, string): - if orth in self.lookup_table: - return self.lookup_table[orth] + def lookup(self, string, orth=None): + """Look up a lemma in the table, if available. If no lemma is found, + the original string is returned. + + string (unicode): The original string. + orth (int): Optional hash of the string to look up. If not set, the + string will be used and hashed. + RETURNS (unicode): The lemma if the string was found, otherwise the + original string. + """ + key = orth if orth is not None else string + if key in self.lookup_table: + return self.lookup_table[key] return string diff --git a/spacy/lookups.py b/spacy/lookups.py index b3b67ae7b..7d100520f 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -3,15 +3,19 @@ from __future__ import unicode_literals import srsly from collections import OrderedDict +from preshed.bloom import BloomFilter from .errors import Errors from .util import SimpleFrozenDict, ensure_path -from .strings import hash_string +from .compat import basestring_ +from .strings import get_string_id -from . import util -import srsly -from preshed.bloom import BloomFilter +def ensure_hash(key): + if isinstance(key, basestring_): + return get_string_id(key) + return key + class Lookups(object): """Container for large lookup tables and dictionaries, e.g. lemmatization @@ -102,7 +106,7 @@ class Lookups(object): """ for key, value in srsly.msgpack_loads(bytes_data).items(): self._tables[key] = Table(key) - self._tables[key].update_raw(value) + self._tables[key].update(value) return self def to_disk(self, path, **kwargs): @@ -137,7 +141,7 @@ class Lookups(object): class Table(OrderedDict): """A table in the lookups. Subclass of builtin dict that implements a - slightly more consistent and unified API. + slightly more consistent and unified API. Includes a Bloom filter to speed up missed lookups. 
""" @@ -163,72 +167,85 @@ class Table(OrderedDict): """ OrderedDict.__init__(self) self.name = name - # assume a default size of 1M items - size = 1E6 - if data and len(data) > 0: - size = len(data) - + # Assume a default size of 1M items + self.default_size = 1e6 + size = len(data) if data and len(data) > 0 else self.default_size self.bloom = BloomFilter.from_error_rate(size) - if data: self.update(data) - def set(self, key, value): - """Set new key/value pair, where key is an integer. Same as - table[key] = value. - """ - self[key] = value - def __setitem__(self, key, value): + """Set new key/value pair. String keys will be hashed. + + key (unicode / int): The key to set. + value: The value to set. + """ + key = ensure_hash(key) OrderedDict.__setitem__(self, key, value) self.bloom.add(key) - def set_string(self, key, value): - """Set new key/value pair, where key is a string to be hashed. - """ - hkey = hash_string(key) - self.set(hkey, value) + def set(self, key, value): + """Set new key/value pair. String keys will be hashed. + Same as table[key] = value. - def update(self, data): - """Add entries in a dict-like to the table, where keys are strings to - be hashed. + key (unicode / int): The key to set. + value: The value to set. """ - for key, val in data.items(): - self.set_string(key, val) + self[key] = value - def update_raw(self, data): - """Add entries in a dict-like to the table, where keys are ints. + def __getitem__(self, key): + """Get the value for a given key. String keys will be hashed. + + key (unicode / int): The key to get. + RETURNS: The value. """ - for key, val in data.items(): - self.set(key, val) + key = ensure_hash(key) + return OrderedDict.__getitem__(self, key) def get(self, key, default=None): + """Get the value for a given key. String keys will be hashed. + + key (unicode / int): The key to get. + default: The default value to return. + RETURNS: The value. + """ + key = ensure_hash(key) return OrderedDict.get(self, key, default) - def get_string(self, key, default=None): - hkey = hash_string(key) - return OrderedDict.get(self, hkey, default) - def __contains__(self, key): + """Check whether a key is in the table. String keys will be hashed. + + key (unicode / int): The key to check. + RETURNS (bool): Whether the key is in the table. + """ + key = ensure_hash(key) # This can give a false positive, so we need to check it after - if key not in self.bloom: + if key not in self.bloom: return False return OrderedDict.__contains__(self, key) - def contains_string(self, key): - hkey = hash_string(key) - return self.__contains__(hkey) - def to_bytes(self): - # TODO: serialize bloom too. For now just reconstruct it. - return srsly.msgpack_dumps({'name': self.name, 'dict': dict(self.items())}) + """Serialize table to a bytestring. - def from_bytes(self, data): - loaded = srsly.msgpack_loads(data) - self.name = loaded['name'] - for key, val in loaded['dict'].items(): - self[key] = val - self.bloom.add(key) + RETURNS (bytes): The serialized table. + """ + data = [ + ("name", self.name), + ("dict", dict(self.items())), + ("bloom", self.bloom.to_bytes()), + ] + return srsly.msgpack_dumps(OrderedDict(data)) + def from_bytes(self, bytes_data): + """Load a table from a bytestring. + + bytes_data (bytes): The data to load. + RETURNS (Table): The loaded table. 
+ """ + loaded = srsly.msgpack_loads(bytes_data) + data = loaded.get("dict", {}) + self.name = loaded["name"] + self.bloom = BloomFilter().from_bytes(loaded["bloom"]) + self.clear() + self.update(data) return self - diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 8cc27fb7d..190ca8d00 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -72,7 +72,7 @@ def _normalize_props(props): for key in FIELDS: if key in props: value = str(props[key]).lower() - # We don't have support for disjunctive int|rel features, so + # We don't have support for disjunctive int|rel features, so # just take the first one :( if "|" in value: value = value.split("|")[0] @@ -273,7 +273,7 @@ cdef class Morphology: """ if token.lemma == 0: orth_str = self.strings[token.lex.orth] - lemma = self.lemmatizer.lookup(token.lex.orth, orth_str) + lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth) token.lemma = self.strings.add(lemma) cdef int assign_tag(self, TokenC* token, tag_str) except -1: diff --git a/spacy/tests/lang/lt/test_lemmatizer.py b/spacy/tests/lang/lt/test_lemmatizer.py index b98d63935..f7408fc16 100644 --- a/spacy/tests/lang/lt/test_lemmatizer.py +++ b/spacy/tests/lang/lt/test_lemmatizer.py @@ -17,6 +17,4 @@ TEST_CASES = [ @pytest.mark.parametrize("tokens,lemmas", TEST_CASES) def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas): - assert lemmas == [ - lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens - ] + assert lemmas == [lt_lemmatizer.lookup_table.get(token, token) for token in tokens] diff --git a/spacy/tests/lang/nl/test_lemmatizer.py b/spacy/tests/lang/nl/test_lemmatizer.py index 93dd1e5e3..dae9091b7 100644 --- a/spacy/tests/lang/nl/test_lemmatizer.py +++ b/spacy/tests/lang/nl/test_lemmatizer.py @@ -133,11 +133,11 @@ def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma): # Using the lemma lookup table only @pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases) def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma): - lemma_pred = nl_lemmatizer.lookup(None, text) + lemma_pred = nl_lemmatizer.lookup(text) assert lemma_pred in (lemma, text) @pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases) def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma): - lemma_pred = nl_lemmatizer.lookup(None, text) + lemma_pred = nl_lemmatizer.lookup(text) assert lemma_pred in (lemma, text) diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index 541e1b63e..02f25532a 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest -from spacy.lookups import Lookups +from spacy.lookups import Lookups, Table, ensure_hash from spacy.vocab import Vocab from ..util import make_tempdir @@ -19,9 +19,9 @@ def test_lookups_api(): table = lookups.get_table(table_name) assert table.name == table_name assert len(table) == 2 - assert table.get_string("hello") == "world" - table.set_string("a", "b") - assert table.get_string("a") == "b" + assert table["hello"] == "world" + table["a"] = "b" + assert table["a"] == "b" table = lookups.get_table(table_name) assert len(table) == 3 with pytest.raises(KeyError): @@ -36,6 +36,43 @@ def test_lookups_api(): lookups.get_table(table_name) +def test_table_api(): + table = Table(name="table") + assert table.name == "table" + assert len(table) == 0 + assert "abc" not in table + data = {"foo": "bar", "hello": "world"} + table = 
Table(name="table", data=data) + assert len(table) == len(data) + assert "foo" in table + assert ensure_hash("foo") in table + assert table["foo"] == "bar" + assert table[ensure_hash("foo")] == "bar" + assert table.get("foo") == "bar" + assert table.get("abc") is None + table["abc"] = 123 + assert table["abc"] == 123 + assert table[ensure_hash("abc")] == 123 + table.set("def", 456) + assert table["def"] == 456 + assert table[ensure_hash("def")] == 456 + + +def test_table_api_to_from_bytes(): + data = {"foo": "bar", "hello": "world", "abc": 123} + table = Table(name="table", data=data) + table_bytes = table.to_bytes() + new_table = Table().from_bytes(table_bytes) + assert new_table.name == "table" + assert len(new_table) == 3 + assert new_table["foo"] == "bar" + assert new_table[ensure_hash("foo")] == "bar" + new_table2 = Table(data={"def": 456}) + new_table2.from_bytes(table_bytes) + assert len(new_table2) == 3 + assert "def" not in new_table2 + + @pytest.mark.skip(reason="This fails on Python 3.5") def test_lookups_to_from_bytes(): lookups = Lookups() @@ -49,10 +86,10 @@ def test_lookups_to_from_bytes(): assert "table2" in new_lookups table1 = new_lookups.get_table("table1") assert len(table1) == 2 - assert table1.get_string("foo") == "bar" + assert table1["foo"] == "bar" table2 = new_lookups.get_table("table2") assert len(table2) == 3 - assert table2.get_string("b") == 2 + assert table2["b"] == 2 assert new_lookups.to_bytes() == lookups_bytes @@ -70,10 +107,10 @@ def test_lookups_to_from_disk(): assert "table2" in new_lookups table1 = new_lookups.get_table("table1") assert len(table1) == 2 - assert table1.get_string("foo") == "bar" + assert table1["foo"] == "bar" table2 = new_lookups.get_table("table2") assert len(table2) == 3 - assert table2.get_string("b") == 2 + assert table2["b"] == 2 @pytest.mark.skip(reason="This fails on Python 3.5") @@ -90,7 +127,7 @@ def test_lookups_to_from_bytes_via_vocab(): assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 - assert table.get_string("hello") == "world" + assert table["hello"] == "world" assert new_vocab.to_bytes() == vocab_bytes @@ -109,4 +146,4 @@ def test_lookups_to_from_disk_via_vocab(): assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 - assert table.get_string("hello") == "world" + assert table["hello"] == "world" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index dfe42d2bd..e27b767a7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -335,7 +335,7 @@ cdef class Token: """ def __get__(self): if self.c.lemma == 0: - lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_) + lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth) return self.vocab.strings[lemma_] else: return self.c.lemma @@ -862,7 +862,7 @@ cdef class Token: """ def __get__(self): if self.c.lemma == 0: - return self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_) + return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth) else: return self.vocab.strings[self.c.lemma] diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 8b6d9dcf6..805e96b0f 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -52,8 +52,8 @@ Lemmatize a string. Look up a lemma in the lookup table, if available. If no lemma is found, the original string is returned. 
Languages can provide a
-[lookup table](/usage/adding-languages#lemmatizer) via the `lemma_lookup`
-variable, set on the individual `Language` class.
+[lookup table](/usage/adding-languages#lemmatizer) via the `resources`
+attribute, set on the individual `Language` class.

> #### Example
>
@@ -63,10 +63,11 @@ variable, set on the individual `Language` class.
> assert lemmatizer.lookup("going") == "go"
> ```

-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------------------- |
-| `string` | unicode | The string to look up. |
-| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
+| Name | Type | Description |
+| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
+| `string` | unicode | The string to look up. |
+| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
+| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |

## Lemmatizer.is_base_form {#is_base_form tag="method"}

diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md
index ab65c4a0c..9878546ea 100644
--- a/website/docs/api/lookups.md
+++ b/website/docs/api/lookups.md
@@ -7,10 +7,11 @@ new: 2.2
---

This class allows convenient access to large lookup tables and dictionaries,
-e.g. lemmatization data or tokenizer exception lists. Lookups are available via
-the [`Vocab`](/api/vocab) as `vocab.lookups`, so they can be accessed before the
-pipeline components are applied (e.g. in the tokenizer and lemmatizer), as well
-as within the pipeline components via `doc.vocab.lookups`.
+e.g. lemmatization data or tokenizer exception lists using Bloom filters.
+Lookups are available via the [`Vocab`](/api/vocab) as `vocab.lookups`, so they
+can be accessed before the pipeline components are applied (e.g. in the
+tokenizer and lemmatizer), as well as within the pipeline components via
+`doc.vocab.lookups`.

## Lookups.\_\_init\_\_ {#init tag="method"}

@@ -215,8 +216,11 @@ the file doesn't exist.

## Table {#table tag="class, ordereddict"}

A table in the lookups. Subclass of `OrderedDict` that implements a slightly
-more consistent and unified API. Supports all other methods and attributes of
-`OrderedDict` / `dict`, and the customized methods listed here.
+more consistent and unified API and includes a Bloom filter to speed up missed
+lookups. Supports **all other methods and attributes** of `OrderedDict` /
+`dict`, and the customized methods listed here. Methods that get or set keys
+accept both integers and strings (which will be hashed before being added to the
+table).

### Table.\_\_init\_\_ {#table.init tag="method"}

Initialize a new table.

> #### Example
>
> ```python
> from spacy.lookups import Table
-> table = Table(name="some_table")
+> data = {"foo": "bar", "baz": 100}
+> table = Table(name="some_table", data=data)
+> assert "foo" in table
+> assert table["foo"] == "bar"
> ```

| Name | Type | Description |
@@ -252,9 +259,10 @@ Initialize a new table from a dict.
| `name` | unicode | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. |

-### Table.set {#table.set tag="key"}
+### Table.set {#table.set tag="method"}

-Set a new key / value pair. Same as `table[key] = value`.
+Set a new key / value pair. String keys will be hashed. Same as
+`table[key] = value`.
> #### Example > @@ -265,7 +273,46 @@ Set a new key / value pair. Same as `table[key] = value`. > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ------- | ------- | ----------- | -| `key` | unicode | The key. | -| `value` | - | The value. | +| Name | Type | Description | +| ------- | ------------- | ----------- | +| `key` | unicode / int | The key. | +| `value` | - | The value. | + +### Table.to_bytes {#table.to_bytes tag="method"} + +Serialize the table to a bytestring. + +> #### Example +> +> ```python +> table_bytes = table.to_bytes() +> ``` + +| Name | Type | Description | +| ----------- | ----- | --------------------- | +| **RETURNS** | bytes | The serialized table. | + +### Table.from_bytes {#table.from_bytes tag="method"} + +Load a table from a bytestring. + +> #### Example +> +> ```python +> table_bytes = table.to_bytes() +> table = Table() +> table.from_bytes(table_bytes) +> ``` + +| Name | Type | Description | +| ------------ | ------- | ----------------- | +| `bytes_data` | bytes | The data to load. | +| **RETURNS** | `Table` | The loaded table. | + +### Attributes {#table-attributes} + +| Name | Type | Description | +| -------------- | --------------------------- | ----------------------------------------------------- | +| `name` | unicode | Table name. | +| `default_size` | int | Default size of bloom filters if no data is provided. | +| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. |
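Taken together, the `Table` changes above mean that string and integer keys address the same entry: every accessor funnels string keys through `ensure_hash`, and `__contains__` consults the Bloom filter before touching the dict. A short sketch of that behaviour, mirroring what the new `test_table_api` asserts (the table name and contents here are illustrative, not shipped data):

```python
from spacy.lookups import Table, ensure_hash

table = Table(name="demo_table", data={"foo": "bar"})
assert table["foo"] == "bar"               # string key, hashed on access
assert table[ensure_hash("foo")] == "bar"  # same entry via the raw hash
table.set("hello", "world")                # alias for table["hello"] = ...
assert "hello" in table                    # checks the Bloom filter first
assert table.get("missing") is None        # absent keys fall back to default
assert ensure_hash(123) == 123             # integer keys pass through as-is
```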
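The lemmatizer change is the same across all five languages touched above: `lookup()` now takes the string first, with the precomputed hash as an optional keyword argument. A minimal self-contained sketch of the shared back-off logic, using a hypothetical `DemoLemmatizer` stand-in rather than any of the patched classes:

```python
from spacy.lookups import Table, ensure_hash

class DemoLemmatizer(object):
    # Hypothetical stand-in that mirrors the patched lookup() bodies.
    def __init__(self, data):
        self.lookup_table = Table(name="lemma_lookup", data=data)

    def lookup(self, string, orth=None):
        # Prefer the caller's precomputed hash; otherwise the string
        # itself is used as the key and hashed by the table.
        key = orth if orth is not None else string
        if key in self.lookup_table:
            return self.lookup_table[key]
        return string

lemmatizer = DemoLemmatizer({"feet": "foot"})
assert lemmatizer.lookup("feet") == "foot"
assert lemmatizer.lookup("feet", orth=ensure_hash("feet")) == "foot"
assert lemmatizer.lookup("spacy") == "spacy"  # miss returns the input
```

This is also why the call sites in `morphology.pyx` and `token.pyx` now pass `orth=token.lex.orth` and `orth=self.orth`: the token's hash already exists, so the table never has to re-hash the string.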
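At the `Lookups` level, nothing changes for callers except that the `get_string`/`set_string` helpers are gone: plain indexing now does the hashing. A usage sketch based on the updated `test_lookups_api` (the table name is illustrative):

```python
from spacy.lookups import Lookups

lookups = Lookups()
table = lookups.add_table("lemma_lookup", {"hello": "world"})
assert table["hello"] == "world"  # replaces table.get_string("hello")
table["a"] = "b"                  # replaces table.set_string("a", "b")
assert lookups.get_table("lemma_lookup")["a"] == "b"
```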
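Finally, `to_bytes`/`from_bytes` now carry the Bloom filter alongside the data, resolving the old TODO, and `from_bytes` clears the table instead of merging into it. A round-trip sketch consistent with `test_table_api_to_from_bytes` (again with illustrative contents):

```python
from spacy.lookups import Table

table = Table(name="demo_table", data={"foo": "bar", "abc": 123})
table_bytes = table.to_bytes()  # now includes the serialized Bloom filter

new_table = Table(data={"stale": "entry"})
new_table.from_bytes(table_bytes)
assert new_table.name == "demo_table"
assert new_table["abc"] == 123
assert "stale" not in new_table  # from_bytes() clears existing entries first
```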