mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
💫 Adjust Table API and add docs (#4289)
* Adjust Table API and add docs * Add attributes and update description [ci skip] * Use strings.get_string_id instead of hash_string * Fix table method calls * Make orth arg in Lemmatizer.lookup optional Fall back to string, which is now handled by Table.__contains__ out-of-the-box * Fix method name * Auto-format
This commit is contained in:
parent
88a9d87f6f
commit
bab9976d9a
|
@ -46,9 +46,10 @@ class GreekLemmatizer(object):
|
||||||
)
|
)
|
||||||
return lemmas
|
return lemmas
|
||||||
|
|
||||||
def lookup(self, orth, string):
|
def lookup(self, string, orth=None):
|
||||||
if orth in self.lookup_table:
|
key = orth if orth is not None else string
|
||||||
return self.lookup_table[orth]
|
if key in self.lookup_table:
|
||||||
|
return self.lookup_table[key]
|
||||||
return string
|
return string
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -52,7 +52,7 @@ class FrenchLemmatizer(object):
|
||||||
elif univ_pos in (SCONJ, "SCONJ", "sconj"):
|
elif univ_pos in (SCONJ, "SCONJ", "sconj"):
|
||||||
univ_pos = "sconj"
|
univ_pos = "sconj"
|
||||||
else:
|
else:
|
||||||
return [self.lookup(None, string)]
|
return [self.lookup(string)]
|
||||||
# See Issue #435 for example of where this logic is required.
|
# See Issue #435 for example of where this logic is required.
|
||||||
if self.is_base_form(univ_pos, morphology):
|
if self.is_base_form(univ_pos, morphology):
|
||||||
return list(set([string.lower()]))
|
return list(set([string.lower()]))
|
||||||
|
@ -114,7 +114,7 @@ class FrenchLemmatizer(object):
|
||||||
def punct(self, string, morphology=None):
|
def punct(self, string, morphology=None):
|
||||||
return self(string, "punct", morphology)
|
return self(string, "punct", morphology)
|
||||||
|
|
||||||
def lookup(self, orth, string):
|
def lookup(self, string, orth=None):
|
||||||
if orth is not None and orth in self.lookup_table:
|
if orth is not None and orth in self.lookup_table:
|
||||||
return self.lookup_table[orth][0]
|
return self.lookup_table[orth][0]
|
||||||
return string
|
return string
|
||||||
|
|
|
@ -62,11 +62,11 @@ class DutchLemmatizer(object):
|
||||||
# are not lemmatized. They are lowercased, however.
|
# are not lemmatized. They are lowercased, however.
|
||||||
return [string]
|
return [string]
|
||||||
# if string in self.lemma_index.get(univ_pos)
|
# if string in self.lemma_index.get(univ_pos)
|
||||||
lemma_index = self.index.get_string(univ_pos, {})
|
lemma_index = self.index.get(univ_pos, {})
|
||||||
# string is already lemma
|
# string is already lemma
|
||||||
if string in lemma_index:
|
if string in lemma_index:
|
||||||
return [string]
|
return [string]
|
||||||
exceptions = self.exc.get_string(univ_pos, {})
|
exceptions = self.exc.get(univ_pos, {})
|
||||||
# string is irregular token contained in exceptions index.
|
# string is irregular token contained in exceptions index.
|
||||||
try:
|
try:
|
||||||
lemma = exceptions[string]
|
lemma = exceptions[string]
|
||||||
|
@ -75,12 +75,12 @@ class DutchLemmatizer(object):
|
||||||
pass
|
pass
|
||||||
# string corresponds to key in lookup table
|
# string corresponds to key in lookup table
|
||||||
lookup_table = self.lookup_table
|
lookup_table = self.lookup_table
|
||||||
looked_up_lemma = lookup_table.get_string(string)
|
looked_up_lemma = lookup_table.get(string)
|
||||||
if looked_up_lemma and looked_up_lemma in lemma_index:
|
if looked_up_lemma and looked_up_lemma in lemma_index:
|
||||||
return [looked_up_lemma]
|
return [looked_up_lemma]
|
||||||
|
|
||||||
forms, is_known = lemmatize(
|
forms, is_known = lemmatize(
|
||||||
string, lemma_index, exceptions, self.rules.get_string(univ_pos, [])
|
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
|
||||||
)
|
)
|
||||||
|
|
||||||
# Back-off through remaining return value candidates.
|
# Back-off through remaining return value candidates.
|
||||||
|
@ -103,12 +103,12 @@ class DutchLemmatizer(object):
|
||||||
# Overrides parent method so that a lowercased version of the string is
|
# Overrides parent method so that a lowercased version of the string is
|
||||||
# used to search the lookup table. This is necessary because our lookup
|
# used to search the lookup table. This is necessary because our lookup
|
||||||
# table consists entirely of lowercase keys.
|
# table consists entirely of lowercase keys.
|
||||||
def lookup(self, orth, string):
|
def lookup(self, string, orth=None):
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
if orth is not None:
|
if orth is not None:
|
||||||
return self.lookup_table.get(orth, string)
|
return self.lookup_table.get(orth, string)
|
||||||
else:
|
else:
|
||||||
return self.lookup_table.get_string(string, string)
|
return self.lookup_table.get(string, string)
|
||||||
|
|
||||||
def noun(self, string, morphology=None):
|
def noun(self, string, morphology=None):
|
||||||
return self(string, "noun", morphology)
|
return self(string, "noun", morphology)
|
||||||
|
|
|
@ -115,7 +115,7 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
def pron(self, string, morphology=None):
|
def pron(self, string, morphology=None):
|
||||||
return self(string, "pron", morphology)
|
return self(string, "pron", morphology)
|
||||||
|
|
||||||
def lookup(self, orth, string):
|
def lookup(self, string, orth=None):
|
||||||
analyses = self._morph.parse(string)
|
analyses = self._morph.parse(string)
|
||||||
if len(analyses) == 1:
|
if len(analyses) == 1:
|
||||||
return analyses[0].normal_form
|
return analyses[0].normal_form
|
||||||
|
|
|
@ -112,7 +112,7 @@ class UkrainianLemmatizer(Lemmatizer):
|
||||||
def pron(self, string, morphology=None):
|
def pron(self, string, morphology=None):
|
||||||
return self(string, "pron", morphology)
|
return self(string, "pron", morphology)
|
||||||
|
|
||||||
def lookup(self, orth, string):
|
def lookup(self, string, orth=None):
|
||||||
analyses = self._morph.parse(string)
|
analyses = self._morph.parse(string)
|
||||||
if len(analyses) == 1:
|
if len(analyses) == 1:
|
||||||
return analyses[0].normal_form
|
return analyses[0].normal_form
|
||||||
|
|
|
@ -93,9 +93,19 @@ class Lemmatizer(object):
|
||||||
def punct(self, string, morphology=None):
|
def punct(self, string, morphology=None):
|
||||||
return self(string, "punct", morphology)
|
return self(string, "punct", morphology)
|
||||||
|
|
||||||
def lookup(self, orth, string):
|
def lookup(self, string, orth=None):
|
||||||
if orth in self.lookup_table:
|
"""Look up a lemma in the table, if available. If no lemma is found,
|
||||||
return self.lookup_table[orth]
|
the original string is returned.
|
||||||
|
|
||||||
|
string (unicode): The original string.
|
||||||
|
orth (int): Optional hash of the string to look up. If not set, the
|
||||||
|
string will be used and hashed.
|
||||||
|
RETURNS (unicode): The lemma if the string was found, otherwise the
|
||||||
|
original string.
|
||||||
|
"""
|
||||||
|
key = orth if orth is not None else string
|
||||||
|
if key in self.lookup_table:
|
||||||
|
return self.lookup_table[key]
|
||||||
return string
|
return string
|
||||||
|
|
||||||
|
|
||||||
|
|
113
spacy/lookups.py
113
spacy/lookups.py
|
@ -3,15 +3,19 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from preshed.bloom import BloomFilter
|
||||||
|
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from .util import SimpleFrozenDict, ensure_path
|
from .util import SimpleFrozenDict, ensure_path
|
||||||
from .strings import hash_string
|
from .compat import basestring_
|
||||||
|
from .strings import get_string_id
|
||||||
|
|
||||||
from . import util
|
|
||||||
|
|
||||||
import srsly
|
def ensure_hash(key):
|
||||||
from preshed.bloom import BloomFilter
|
if isinstance(key, basestring_):
|
||||||
|
return get_string_id(key)
|
||||||
|
return key
|
||||||
|
|
||||||
|
|
||||||
class Lookups(object):
|
class Lookups(object):
|
||||||
"""Container for large lookup tables and dictionaries, e.g. lemmatization
|
"""Container for large lookup tables and dictionaries, e.g. lemmatization
|
||||||
|
@ -102,7 +106,7 @@ class Lookups(object):
|
||||||
"""
|
"""
|
||||||
for key, value in srsly.msgpack_loads(bytes_data).items():
|
for key, value in srsly.msgpack_loads(bytes_data).items():
|
||||||
self._tables[key] = Table(key)
|
self._tables[key] = Table(key)
|
||||||
self._tables[key].update_raw(value)
|
self._tables[key].update(value)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, **kwargs):
|
def to_disk(self, path, **kwargs):
|
||||||
|
@ -163,72 +167,85 @@ class Table(OrderedDict):
|
||||||
"""
|
"""
|
||||||
OrderedDict.__init__(self)
|
OrderedDict.__init__(self)
|
||||||
self.name = name
|
self.name = name
|
||||||
# assume a default size of 1M items
|
# Assume a default size of 1M items
|
||||||
size = 1E6
|
self.default_size = 1e6
|
||||||
if data and len(data) > 0:
|
size = len(data) if data and len(data) > 0 else self.default_size
|
||||||
size = len(data)
|
|
||||||
|
|
||||||
self.bloom = BloomFilter.from_error_rate(size)
|
self.bloom = BloomFilter.from_error_rate(size)
|
||||||
|
|
||||||
if data:
|
if data:
|
||||||
self.update(data)
|
self.update(data)
|
||||||
|
|
||||||
def set(self, key, value):
|
|
||||||
"""Set new key/value pair, where key is an integer. Same as
|
|
||||||
table[key] = value.
|
|
||||||
"""
|
|
||||||
self[key] = value
|
|
||||||
|
|
||||||
def __setitem__(self, key, value):
|
def __setitem__(self, key, value):
|
||||||
|
"""Set new key/value pair. String keys will be hashed.
|
||||||
|
|
||||||
|
key (unicode / int): The key to set.
|
||||||
|
value: The value to set.
|
||||||
|
"""
|
||||||
|
key = ensure_hash(key)
|
||||||
OrderedDict.__setitem__(self, key, value)
|
OrderedDict.__setitem__(self, key, value)
|
||||||
self.bloom.add(key)
|
self.bloom.add(key)
|
||||||
|
|
||||||
def set_string(self, key, value):
|
def set(self, key, value):
|
||||||
"""Set new key/value pair, where key is a string to be hashed.
|
"""Set new key/value pair. String keys will be hashed.
|
||||||
"""
|
Same as table[key] = value.
|
||||||
hkey = hash_string(key)
|
|
||||||
self.set(hkey, value)
|
|
||||||
|
|
||||||
def update(self, data):
|
key (unicode / int): The key to set.
|
||||||
"""Add entries in a dict-like to the table, where keys are strings to
|
value: The value to set.
|
||||||
be hashed.
|
|
||||||
"""
|
"""
|
||||||
for key, val in data.items():
|
self[key] = value
|
||||||
self.set_string(key, val)
|
|
||||||
|
|
||||||
def update_raw(self, data):
|
def __getitem__(self, key):
|
||||||
"""Add entries in a dict-like to the table, where keys are ints.
|
"""Get the value for a given key. String keys will be hashed.
|
||||||
|
|
||||||
|
key (unicode / int): The key to get.
|
||||||
|
RETURNS: The value.
|
||||||
"""
|
"""
|
||||||
for key, val in data.items():
|
key = ensure_hash(key)
|
||||||
self.set(key, val)
|
return OrderedDict.__getitem__(self, key)
|
||||||
|
|
||||||
def get(self, key, default=None):
|
def get(self, key, default=None):
|
||||||
|
"""Get the value for a given key. String keys will be hashed.
|
||||||
|
|
||||||
|
key (unicode / int): The key to get.
|
||||||
|
default: The default value to return.
|
||||||
|
RETURNS: The value.
|
||||||
|
"""
|
||||||
|
key = ensure_hash(key)
|
||||||
return OrderedDict.get(self, key, default)
|
return OrderedDict.get(self, key, default)
|
||||||
|
|
||||||
def get_string(self, key, default=None):
|
|
||||||
hkey = hash_string(key)
|
|
||||||
return OrderedDict.get(self, hkey, default)
|
|
||||||
|
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
|
"""Check whether a key is in the table. String keys will be hashed.
|
||||||
|
|
||||||
|
key (unicode / int): The key to check.
|
||||||
|
RETURNS (bool): Whether the key is in the table.
|
||||||
|
"""
|
||||||
|
key = ensure_hash(key)
|
||||||
# This can give a false positive, so we need to check it after
|
# This can give a false positive, so we need to check it after
|
||||||
if key not in self.bloom:
|
if key not in self.bloom:
|
||||||
return False
|
return False
|
||||||
return OrderedDict.__contains__(self, key)
|
return OrderedDict.__contains__(self, key)
|
||||||
|
|
||||||
def contains_string(self, key):
|
|
||||||
hkey = hash_string(key)
|
|
||||||
return self.__contains__(hkey)
|
|
||||||
|
|
||||||
def to_bytes(self):
|
def to_bytes(self):
|
||||||
# TODO: serialize bloom too. For now just reconstruct it.
|
"""Serialize table to a bytestring.
|
||||||
return srsly.msgpack_dumps({'name': self.name, 'dict': dict(self.items())})
|
|
||||||
|
|
||||||
def from_bytes(self, data):
|
RETURNS (bytes): The serialized table.
|
||||||
loaded = srsly.msgpack_loads(data)
|
"""
|
||||||
self.name = loaded['name']
|
data = [
|
||||||
for key, val in loaded['dict'].items():
|
("name", self.name),
|
||||||
self[key] = val
|
("dict", dict(self.items())),
|
||||||
self.bloom.add(key)
|
("bloom", self.bloom.to_bytes()),
|
||||||
|
]
|
||||||
|
return srsly.msgpack_dumps(OrderedDict(data))
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data):
|
||||||
|
"""Load a table from a bytestring.
|
||||||
|
|
||||||
|
bytes_data (bytes): The data to load.
|
||||||
|
RETURNS (Table): The loaded table.
|
||||||
|
"""
|
||||||
|
loaded = srsly.msgpack_loads(bytes_data)
|
||||||
|
data = loaded.get("dict", {})
|
||||||
|
self.name = loaded["name"]
|
||||||
|
self.bloom = BloomFilter().from_bytes(loaded["bloom"])
|
||||||
|
self.clear()
|
||||||
|
self.update(data)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
|
@ -273,7 +273,7 @@ cdef class Morphology:
|
||||||
"""
|
"""
|
||||||
if token.lemma == 0:
|
if token.lemma == 0:
|
||||||
orth_str = self.strings[token.lex.orth]
|
orth_str = self.strings[token.lex.orth]
|
||||||
lemma = self.lemmatizer.lookup(token.lex.orth, orth_str)
|
lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth)
|
||||||
token.lemma = self.strings.add(lemma)
|
token.lemma = self.strings.add(lemma)
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag_str) except -1:
|
cdef int assign_tag(self, TokenC* token, tag_str) except -1:
|
||||||
|
|
|
@ -17,6 +17,4 @@ TEST_CASES = [
|
||||||
|
|
||||||
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
|
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
|
||||||
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
|
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
|
||||||
assert lemmas == [
|
assert lemmas == [lt_lemmatizer.lookup_table.get(token, token) for token in tokens]
|
||||||
lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens
|
|
||||||
]
|
|
||||||
|
|
|
@ -133,11 +133,11 @@ def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma):
|
||||||
# Using the lemma lookup table only
|
# Using the lemma lookup table only
|
||||||
@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
|
@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
|
||||||
def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma):
|
def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma):
|
||||||
lemma_pred = nl_lemmatizer.lookup(None, text)
|
lemma_pred = nl_lemmatizer.lookup(text)
|
||||||
assert lemma_pred in (lemma, text)
|
assert lemma_pred in (lemma, text)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
|
@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
|
||||||
def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma):
|
def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma):
|
||||||
lemma_pred = nl_lemmatizer.lookup(None, text)
|
lemma_pred = nl_lemmatizer.lookup(text)
|
||||||
assert lemma_pred in (lemma, text)
|
assert lemma_pred in (lemma, text)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.lookups import Lookups
|
from spacy.lookups import Lookups, Table, ensure_hash
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
@ -19,9 +19,9 @@ def test_lookups_api():
|
||||||
table = lookups.get_table(table_name)
|
table = lookups.get_table(table_name)
|
||||||
assert table.name == table_name
|
assert table.name == table_name
|
||||||
assert len(table) == 2
|
assert len(table) == 2
|
||||||
assert table.get_string("hello") == "world"
|
assert table["hello"] == "world"
|
||||||
table.set_string("a", "b")
|
table["a"] = "b"
|
||||||
assert table.get_string("a") == "b"
|
assert table["a"] == "b"
|
||||||
table = lookups.get_table(table_name)
|
table = lookups.get_table(table_name)
|
||||||
assert len(table) == 3
|
assert len(table) == 3
|
||||||
with pytest.raises(KeyError):
|
with pytest.raises(KeyError):
|
||||||
|
@ -36,6 +36,43 @@ def test_lookups_api():
|
||||||
lookups.get_table(table_name)
|
lookups.get_table(table_name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_table_api():
|
||||||
|
table = Table(name="table")
|
||||||
|
assert table.name == "table"
|
||||||
|
assert len(table) == 0
|
||||||
|
assert "abc" not in table
|
||||||
|
data = {"foo": "bar", "hello": "world"}
|
||||||
|
table = Table(name="table", data=data)
|
||||||
|
assert len(table) == len(data)
|
||||||
|
assert "foo" in table
|
||||||
|
assert ensure_hash("foo") in table
|
||||||
|
assert table["foo"] == "bar"
|
||||||
|
assert table[ensure_hash("foo")] == "bar"
|
||||||
|
assert table.get("foo") == "bar"
|
||||||
|
assert table.get("abc") is None
|
||||||
|
table["abc"] = 123
|
||||||
|
assert table["abc"] == 123
|
||||||
|
assert table[ensure_hash("abc")] == 123
|
||||||
|
table.set("def", 456)
|
||||||
|
assert table["def"] == 456
|
||||||
|
assert table[ensure_hash("def")] == 456
|
||||||
|
|
||||||
|
|
||||||
|
def test_table_api_to_from_bytes():
|
||||||
|
data = {"foo": "bar", "hello": "world", "abc": 123}
|
||||||
|
table = Table(name="table", data=data)
|
||||||
|
table_bytes = table.to_bytes()
|
||||||
|
new_table = Table().from_bytes(table_bytes)
|
||||||
|
assert new_table.name == "table"
|
||||||
|
assert len(new_table) == 3
|
||||||
|
assert new_table["foo"] == "bar"
|
||||||
|
assert new_table[ensure_hash("foo")] == "bar"
|
||||||
|
new_table2 = Table(data={"def": 456})
|
||||||
|
new_table2.from_bytes(table_bytes)
|
||||||
|
assert len(new_table2) == 3
|
||||||
|
assert "def" not in new_table2
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="This fails on Python 3.5")
|
@pytest.mark.skip(reason="This fails on Python 3.5")
|
||||||
def test_lookups_to_from_bytes():
|
def test_lookups_to_from_bytes():
|
||||||
lookups = Lookups()
|
lookups = Lookups()
|
||||||
|
@ -49,10 +86,10 @@ def test_lookups_to_from_bytes():
|
||||||
assert "table2" in new_lookups
|
assert "table2" in new_lookups
|
||||||
table1 = new_lookups.get_table("table1")
|
table1 = new_lookups.get_table("table1")
|
||||||
assert len(table1) == 2
|
assert len(table1) == 2
|
||||||
assert table1.get_string("foo") == "bar"
|
assert table1["foo"] == "bar"
|
||||||
table2 = new_lookups.get_table("table2")
|
table2 = new_lookups.get_table("table2")
|
||||||
assert len(table2) == 3
|
assert len(table2) == 3
|
||||||
assert table2.get_string("b") == 2
|
assert table2["b"] == 2
|
||||||
assert new_lookups.to_bytes() == lookups_bytes
|
assert new_lookups.to_bytes() == lookups_bytes
|
||||||
|
|
||||||
|
|
||||||
|
@ -70,10 +107,10 @@ def test_lookups_to_from_disk():
|
||||||
assert "table2" in new_lookups
|
assert "table2" in new_lookups
|
||||||
table1 = new_lookups.get_table("table1")
|
table1 = new_lookups.get_table("table1")
|
||||||
assert len(table1) == 2
|
assert len(table1) == 2
|
||||||
assert table1.get_string("foo") == "bar"
|
assert table1["foo"] == "bar"
|
||||||
table2 = new_lookups.get_table("table2")
|
table2 = new_lookups.get_table("table2")
|
||||||
assert len(table2) == 3
|
assert len(table2) == 3
|
||||||
assert table2.get_string("b") == 2
|
assert table2["b"] == 2
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="This fails on Python 3.5")
|
@pytest.mark.skip(reason="This fails on Python 3.5")
|
||||||
|
@ -90,7 +127,7 @@ def test_lookups_to_from_bytes_via_vocab():
|
||||||
assert table_name in new_vocab.lookups
|
assert table_name in new_vocab.lookups
|
||||||
table = new_vocab.lookups.get_table(table_name)
|
table = new_vocab.lookups.get_table(table_name)
|
||||||
assert len(table) == 2
|
assert len(table) == 2
|
||||||
assert table.get_string("hello") == "world"
|
assert table["hello"] == "world"
|
||||||
assert new_vocab.to_bytes() == vocab_bytes
|
assert new_vocab.to_bytes() == vocab_bytes
|
||||||
|
|
||||||
|
|
||||||
|
@ -109,4 +146,4 @@ def test_lookups_to_from_disk_via_vocab():
|
||||||
assert table_name in new_vocab.lookups
|
assert table_name in new_vocab.lookups
|
||||||
table = new_vocab.lookups.get_table(table_name)
|
table = new_vocab.lookups.get_table(table_name)
|
||||||
assert len(table) == 2
|
assert len(table) == 2
|
||||||
assert table.get_string("hello") == "world"
|
assert table["hello"] == "world"
|
||||||
|
|
|
@ -335,7 +335,7 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self.c.lemma == 0:
|
if self.c.lemma == 0:
|
||||||
lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_)
|
lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
|
||||||
return self.vocab.strings[lemma_]
|
return self.vocab.strings[lemma_]
|
||||||
else:
|
else:
|
||||||
return self.c.lemma
|
return self.c.lemma
|
||||||
|
@ -862,7 +862,7 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self.c.lemma == 0:
|
if self.c.lemma == 0:
|
||||||
return self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_)
|
return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
|
||||||
else:
|
else:
|
||||||
return self.vocab.strings[self.c.lemma]
|
return self.vocab.strings[self.c.lemma]
|
||||||
|
|
||||||
|
|
|
@ -52,8 +52,8 @@ Lemmatize a string.
|
||||||
|
|
||||||
Look up a lemma in the lookup table, if available. If no lemma is found, the
|
Look up a lemma in the lookup table, if available. If no lemma is found, the
|
||||||
original string is returned. Languages can provide a
|
original string is returned. Languages can provide a
|
||||||
[lookup table](/usage/adding-languages#lemmatizer) via the `lemma_lookup`
|
[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
|
||||||
variable, set on the individual `Language` class.
|
the individual `Language` class.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -64,8 +64,9 @@ variable, set on the individual `Language` class.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ------- | ----------------------------------------------------------------- |
|
| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
|
||||||
| `string` | unicode | The string to look up. |
|
| `string` | unicode | The string to look up. |
|
||||||
|
| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
|
||||||
| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
|
| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
|
||||||
|
|
||||||
## Lemmatizer.is_base_form {#is_base_form tag="method"}
|
## Lemmatizer.is_base_form {#is_base_form tag="method"}
|
||||||
|
|
|
@ -7,10 +7,11 @@ new: 2.2
|
||||||
---
|
---
|
||||||
|
|
||||||
This class allows convenient access to large lookup tables and dictionaries,
|
This class allows convenient access to large lookup tables and dictionaries,
|
||||||
e.g. lemmatization data or tokenizer exception lists. Lookups are available via
|
e.g. lemmatization data or tokenizer exception lists using Bloom filters.
|
||||||
the [`Vocab`](/api/vocab) as `vocab.lookups`, so they can be accessed before the
|
Lookups are available via the [`Vocab`](/api/vocab) as `vocab.lookups`, so they
|
||||||
pipeline components are applied (e.g. in the tokenizer and lemmatizer), as well
|
can be accessed before the pipeline components are applied (e.g. in the
|
||||||
as within the pipeline components via `doc.vocab.lookups`.
|
tokenizer and lemmatizer), as well as within the pipeline components via
|
||||||
|
`doc.vocab.lookups`.
|
||||||
|
|
||||||
## Lookups.\_\_init\_\_ {#init tag="method"}
|
## Lookups.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
@ -215,8 +216,11 @@ the file doesn't exist.
|
||||||
## Table {#table tag="class, ordereddict"}
|
## Table {#table tag="class, ordereddict"}
|
||||||
|
|
||||||
A table in the lookups. Subclass of `OrderedDict` that implements a slightly
|
A table in the lookups. Subclass of `OrderedDict` that implements a slightly
|
||||||
more consistent and unified API. Supports all other methods and attributes of
|
more consistent and unified API and includes a Bloom filter to speed up missed
|
||||||
`OrderedDict` / `dict`, and the customized methods listed here.
|
lookups. Supports **all other methods and attributes** of `OrderedDict` /
|
||||||
|
`dict`, and the customized methods listed here. Methods that get or set keys
|
||||||
|
accept both integers and strings (which will be hashed before being added to the
|
||||||
|
table).
|
||||||
|
|
||||||
### Table.\_\_init\_\_ {#table.init tag="method"}
|
### Table.\_\_init\_\_ {#table.init tag="method"}
|
||||||
|
|
||||||
|
@ -226,7 +230,10 @@ Initialize a new table.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.lookups import Table
|
> from spacy.lookups import Table
|
||||||
> table = Table(name="some_table")
|
> data = {"foo": "bar", "baz": 100}
|
||||||
|
> table = Table(name="some_table", data=data)
|
||||||
|
> assert "foo" in table
|
||||||
|
> assert table["foo"] == "bar"
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
|
@ -252,9 +259,10 @@ Initialize a new table from a dict.
|
||||||
| `name` | unicode | Optional table name for reference. |
|
| `name` | unicode | Optional table name for reference. |
|
||||||
| **RETURNS** | `Table` | The newly constructed object. |
|
| **RETURNS** | `Table` | The newly constructed object. |
|
||||||
|
|
||||||
### Table.set {#table.set tag="key"}
|
### Table.set {#table.set tag="method"}
|
||||||
|
|
||||||
Set a new key / value pair. Same as `table[key] = value`.
|
Set a new key / value pair. String keys will be hashed. Same as
|
||||||
|
`table[key] = value`.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -266,6 +274,45 @@ Set a new key / value pair. Same as `table[key] = value`.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------- | ------- | ----------- |
|
| ------- | ------------- | ----------- |
|
||||||
| `key` | unicode | The key. |
|
| `key` | unicode / int | The key. |
|
||||||
| `value` | - | The value. |
|
| `value` | - | The value. |
|
||||||
|
|
||||||
|
### Table.to_bytes {#table.to_bytes tag="method"}
|
||||||
|
|
||||||
|
Serialize the table to a bytestring.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> table_bytes = table.to_bytes()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----- | --------------------- |
|
||||||
|
| **RETURNS** | bytes | The serialized table. |
|
||||||
|
|
||||||
|
### Table.from_bytes {#table.from_bytes tag="method"}
|
||||||
|
|
||||||
|
Load a table from a bytestring.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> table_bytes = table.to_bytes()
|
||||||
|
> table = Table()
|
||||||
|
> table.from_bytes(table_bytes)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ------------ | ------- | ----------------- |
|
||||||
|
| `bytes_data` | bytes | The data to load. |
|
||||||
|
| **RETURNS** | `Table` | The loaded table. |
|
||||||
|
|
||||||
|
### Attributes {#table-attributes}
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------- | --------------------------- | ----------------------------------------------------- |
|
||||||
|
| `name` | unicode | Table name. |
|
||||||
|
| `default_size` | int | Default size of the Bloom filter if no data is provided. |
|
||||||
|
| `bloom` | `preshed.bloom.BloomFilter` | The Bloom filter. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user