diff --git a/spacy/lookups.py b/spacy/lookups.py index e639009df..a6fa7abff 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -32,7 +32,7 @@ class Lookups(object): Lookups.has_table. name (unicode): Name of the table. - RETURNS (bool): Whether a table of that name exists. + RETURNS (bool): Whether a table of that name is in the lookups. """ return self.has_table(name) @@ -72,7 +72,7 @@ class Lookups(object): def remove_table(self, name): """Remove a table. Raises an error if the table doesn't exist. - name (unicode): The name to remove. + name (unicode): Name of the table to remove. RETURNS (Table): The removed table. """ if name not in self._tables: @@ -87,19 +87,18 @@ class Lookups(object): """ return name in self._tables - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, **kwargs): """Serialize the lookups to a bytestring. - exclude (list): String names of serialization fields to exclude. RETURNS (bytes): The serialized Lookups. """ return srsly.msgpack_dumps(self._tables) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, **kwargs): """Load the lookups from a bytestring. - exclude (list): String names of serialization fields to exclude. - RETURNS (bytes): The loaded Lookups. + bytes_data (bytes): The data to load. + RETURNS (Lookups): The loaded Lookups. """ self._tables = OrderedDict() msg = srsly.msgpack_loads(bytes_data) @@ -108,7 +107,8 @@ class Lookups(object): return self def to_disk(self, path, **kwargs): - """Save the lookups to a directory as lookups.bin. + """Save the lookups to a directory as lookups.bin. Expects a path to a + directory, which will be created if it doesn't exist. path (unicode / Path): The file path. """ @@ -121,9 +121,10 @@ class Lookups(object): file_.write(self.to_bytes()) def from_disk(self, path, **kwargs): - """Load lookups from a directory containing a lookups.bin. + """Load lookups from a directory containing a lookups.bin. Will skip + loading if the file doesn't exist. - path (unicode / Path): The file path. + path (unicode / Path): The directory path. RETURNS (Lookups): The loaded lookups. """ path = ensure_path(path) @@ -136,12 +137,18 @@ class Lookups(object): class Table(OrderedDict): - """A table in the lookups. Subclass of builtin dict that implements a + """A table in the lookups. Subclass of OrderedDict that implements a slightly more consistent and unified API. """ @classmethod def from_dict(cls, data, name=None): + """Initialize a new table from a dict. + + data (dict): The dictionary. + name (unicode): Optional table name for reference. + RETURNS (Table): The newly created object. + """ self = cls(name=name) self.update(data) return self diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md new file mode 100644 index 000000000..ab65c4a0c --- /dev/null +++ b/website/docs/api/lookups.md @@ -0,0 +1,271 @@ +--- +title: Lookups +teaser: A container for large lookup tables and dictionaries +tag: class +source: spacy/lookups.py +new: 2.2 +--- + +This class allows convenient accesss to large lookup tables and dictionaries, +e.g. lemmatization data or tokenizer exception lists. Lookups are available via +the [`Vocab`](/api/vocab) as `vocab.lookups`, so they can be accessed before the +pipeline components are applied (e.g. in the tokenizer and lemmatizer), as well +as within the pipeline components via `doc.vocab.lookups`. + +## Lookups.\_\_init\_\_ {#init tag="method"} + +Create a `Lookups` object. + +> #### Example +> +> ```python +> from spacy.lookups import Lookups +> lookups = Lookups() +> ``` + +| Name | Type | Description | +| ----------- | --------- | ----------------------------- | +| **RETURNS** | `Lookups` | The newly constructed object. | + +## Lookups.\_\_len\_\_ {#len tag="method"} + +Get the current number of tables in the lookups. + +> #### Example +> +> ```python +> lookups = Lookups() +> assert len(lookups) == 0 +> ``` + +| Name | Type | Description | +| ----------- | ---- | ------------------------------------ | +| **RETURNS** | int | The number of tables in the lookups. | + +## Lookups.\_\contains\_\_ {#contains tag="method"} + +Check if the lookups contain a table of a given name. Delegates to +[`Lookups.has_table`](/api/lookups#has_table). + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table") +> assert "some_table" in lookups +> ``` + +| Name | Type | Description | +| ----------- | ------- | ----------------------------------------------- | +| `name` | unicode | Name of the table. | +| **RETURNS** | bool | Whether a table of that name is in the lookups. | + +## Lookups.tables {#tables tag="property"} + +Get the names of all tables in the lookups. + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table") +> assert lookups.tables == ["some_table"] +> ``` + +| Name | Type | Description | +| ----------- | ---- | ----------------------------------- | +| **RETURNS** | list | Names of the tables in the lookups. | + +## Lookups.add_table {#add_table tag="method"} + +Add a new table with optional data to the lookups. Raises an error if the table +exists. + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table", {"foo": "bar"}) +> ``` + +| Name | Type | Description | +| ----------- | ----------------------------- | ---------------------------------- | +| `name` | unicode | Unique name of the table. | +| `data` | dict | Optional data to add to the table. | +| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. | + +## Lookups.get_table {#get_table tag="method"} + +Get a table from the lookups. Raises an error if the table doesn't exist. + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table", {"foo": "bar"}) +> table = lookups.get_table("some_table") +> assert table["foo"] == "bar" +> ``` + +| Name | Type | Description | +| ----------- | ----------------------------- | ------------------ | +| `name` | unicode | Name of the table. | +| **RETURNS** | [`Table`](/api/lookups#table) | The table. | + +## Lookups.remove_table {#remove_table tag="method"} + +Remove a table from the lookups. Raises an error if the table doesn't exist. + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table") +> removed_table = lookups.remove_table("some_table") +> assert "some_table" not in lookups +> ``` + +| Name | Type | Description | +| ----------- | ----------------------------- | ---------------------------- | +| `name` | unicode | Name of the table to remove. | +| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. | + +## Lookups.has_table {#has_table tag="method"} + +Check if the lookups contain a table of a given name. Equivalent to +[`Lookups.__contains__`](/api/lookups#contains). + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table") +> assert lookups.has_table("some_table") +> ``` + +| Name | Type | Description | +| ----------- | ------- | ----------------------------------------------- | +| `name` | unicode | Name of the table. | +| **RETURNS** | bool | Whether a table of that name is in the lookups. | + +## Lookups.to_bytes {#to_bytes tag="method"} + +Serialize the lookups to a bytestring. + +> #### Example +> +> ```python +> lookup_bytes = lookups.to_bytes() +> ``` + +| Name | Type | Description | +| ----------- | ----- | ----------------------- | +| **RETURNS** | bytes | The serialized lookups. | + +## Lookups.from_bytes {#from_bytes tag="method"} + +Load the lookups from a bytestring. + +> #### Example +> +> ```python +> lookup_bytes = lookups.to_bytes() +> lookups = Lookups() +> lookups.from_bytes(lookup_bytes) +> ``` + +| Name | Type | Description | +| ------------ | --------- | ---------------------- | +| `bytes_data` | bytes | The data to load from. | +| **RETURNS** | `Lookups` | The loaded lookups. | + +## Lookups.to_disk {#to_disk tag="method"} + +Save the lookups to a directory as `lookups.bin`. Expects a path to a directory, +which will be created if it doesn't exist. + +> #### Example +> +> ```python +> lookups.to_disk("/path/to/lookups") +> ``` + +| Name | Type | Description | +| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | + +## Lookups.from_disk {#from_disk tag="method"} + +Load lookups from a directory containing a `lookups.bin`. Will skip loading if +the file doesn't exist. + +> #### Example +> +> ```python +> from spacy.lookups import Lookups +> lookups = Lookups() +> lookups.from_disk("/path/to/lookups") +> ``` + +| Name | Type | Description | +| ----------- | ---------------- | -------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Lookups` | The loaded lookups. | + +## Table {#table tag="class, ordererddict"} + +A table in the lookups. Subclass of `OrderedDict` that implements a slightly +more consistent and unified API. Supports all other methods and attributes of +`OrderedDict` / `dict`, and the customized methods listed here. + +### Table.\_\_init\_\_ {#table.init tag="method"} + +Initialize a new table. + +> #### Example +> +> ```python +> from spacy.lookups import Table +> table = Table(name="some_table") +> ``` + +| Name | Type | Description | +| ----------- | ------- | ---------------------------------- | +| `name` | unicode | Optional table name for reference. | +| **RETURNS** | `Table` | The newly constructed object. | + +### Table.from_dict {#table.from_dict tag="classmethod"} + +Initialize a new table from a dict. + +> #### Example +> +> ```python +> from spacy.lookups import Table +> data = {"foo": "bar", "baz": 100} +> table = Table.from_dict(data, name="some_table") +> ``` + +| Name | Type | Description | +| ----------- | ------- | ---------------------------------- | +| `data` | dict | The dictionary. | +| `name` | unicode | Optional table name for reference. | +| **RETURNS** | `Table` | The newly constructed object. | + +### Table.set {#table.set tag="key"} + +Set a new key / value pair. Same as `table[key] = value`. + +> #### Example +> +> ```python +> from spacy.lookups import Table +> table = Table() +> table.set("foo", "bar") +> assert table["foo"] == "bar" +> ``` + +| Name | Type | Description | +| ------- | ------- | ----------- | +| `key` | unicode | The key. | +| `value` | - | The value. | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index cd21a91d6..22bfe324e 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -293,6 +293,7 @@ Load state from a binary string. | `strings` | `StringStore` | A table managing the string-to-int mapping. | | `vectors` 2 | `Vectors` | A table associating word IDs to word vectors. | | `vectors_length` | int | Number of dimensions for each word vector. | +| `lookups` | `Lookups` | The available lookup tables in this vocab. | | `writing_system` 2.1 | dict | A dict with information about the language's writing system. | ## Serialization fields {#serialization-fields} @@ -313,3 +314,4 @@ serialization by passing in the string names via the `exclude` argument. | `strings` | The strings in the [`StringStore`](/api/stringstore). | | `lexemes` | The lexeme data. | | `vectors` | The word vectors, if available. | +| `lookups` | The lookup tables, if available. | diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 3c4f09674..a05440e5a 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -90,6 +90,7 @@ { "text": "Vocab", "url": "/api/vocab" }, { "text": "StringStore", "url": "/api/stringstore" }, { "text": "Vectors", "url": "/api/vectors" }, + { "text": "Lookups", "url": "/api/lookups" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "GoldParse", "url": "/api/goldparse" }, { "text": "GoldCorpus", "url": "/api/goldcorpus" },