mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	* Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance * Update docstrings * Update docstrings and errors * Update test * Add Lookups.__len__ * Add serialization methods * Add Lookups.remove_table * Use msgpack for serialization to disk * Fix file exists check * Try using OrderedDict for everything * Update .flake8 [ci skip] * Try fixing serialization * Update test_lookups.py * Update test_serialize_vocab_strings.py * Fix serialization for lookups * Fix lookups * Fix lookups * Fix lookups * Try to fix serialization * Try to fix serialization * Try to fix serialization * Try to fix serialization * Give up on serialization test * Xfail more serialization tests for 3.5 * Fix lookups for 2.7
		
			
				
	
	
		
			158 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			158 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| import srsly
 | |
| from collections import OrderedDict
 | |
| 
 | |
| from .errors import Errors
 | |
| from .util import SimpleFrozenDict, ensure_path
 | |
| 
 | |
| 
 | |
| class Lookups(object):
 | |
|     """Container for large lookup tables and dictionaries, e.g. lemmatization
 | |
|     data or tokenizer exception lists. Lookups are available via vocab.lookups,
 | |
|     so they can be accessed before the pipeline components are applied (e.g.
 | |
|     in the tokenizer and lemmatizer), as well as within the pipeline components
 | |
|     via doc.vocab.lookups.
 | |
| 
 | |
|     Important note: At the moment, this class only performs a very basic
 | |
|     dictionary lookup. We're planning to replace this with a more efficient
 | |
|     implementation. See #3971 for details.
 | |
|     """
 | |
| 
 | |
|     def __init__(self):
 | |
|         """Initialize the Lookups object.
 | |
| 
 | |
|         RETURNS (Lookups): The newly created object.
 | |
|         """
 | |
|         self._tables = OrderedDict()
 | |
| 
 | |
|     def __contains__(self, name):
 | |
|         """Check if the lookups contain a table of a given name. Delegates to
 | |
|         Lookups.has_table.
 | |
| 
 | |
|         name (unicode): Name of the table.
 | |
|         RETURNS (bool): Whether a table of that name exists.
 | |
|         """
 | |
|         return self.has_table(name)
 | |
| 
 | |
|     def __len__(self):
 | |
|         """RETURNS (int): The number of tables in the lookups."""
 | |
|         return len(self._tables)
 | |
| 
 | |
|     @property
 | |
|     def tables(self):
 | |
|         """RETURNS (list): Names of all tables in the lookups."""
 | |
|         return list(self._tables.keys())
 | |
| 
 | |
|     def add_table(self, name, data=SimpleFrozenDict()):
 | |
|         """Add a new table to the lookups. Raises an error if the table exists.
 | |
| 
 | |
|         name (unicode): Unique name of table.
 | |
|         data (dict): Optional data to add to the table.
 | |
|         RETURNS (Table): The newly added table.
 | |
|         """
 | |
|         if name in self.tables:
 | |
|             raise ValueError(Errors.E158.format(name=name))
 | |
|         table = Table(name=name)
 | |
|         table.update(data)
 | |
|         self._tables[name] = table
 | |
|         return table
 | |
| 
 | |
|     def get_table(self, name):
 | |
|         """Get a table. Raises an error if the table doesn't exist.
 | |
| 
 | |
|         name (unicode): Name of the table.
 | |
|         RETURNS (Table): The table.
 | |
|         """
 | |
|         if name not in self._tables:
 | |
|             raise KeyError(Errors.E159.format(name=name, tables=self.tables))
 | |
|         return self._tables[name]
 | |
| 
 | |
|     def remove_table(self, name):
 | |
|         """Remove a table. Raises an error if the table doesn't exist.
 | |
| 
 | |
|         name (unicode): The name to remove.
 | |
|         RETURNS (Table): The removed table.
 | |
|         """
 | |
|         if name not in self._tables:
 | |
|             raise KeyError(Errors.E159.format(name=name, tables=self.tables))
 | |
|         return self._tables.pop(name)
 | |
| 
 | |
|     def has_table(self, name):
 | |
|         """Check if the lookups contain a table of a given name.
 | |
| 
 | |
|         name (unicode): Name of the table.
 | |
|         RETURNS (bool): Whether a table of that name exists.
 | |
|         """
 | |
|         return name in self._tables
 | |
| 
 | |
|     def to_bytes(self, exclude=tuple(), **kwargs):
 | |
|         """Serialize the lookups to a bytestring.
 | |
| 
 | |
|         exclude (list): String names of serialization fields to exclude.
 | |
|         RETURNS (bytes): The serialized Lookups.
 | |
|         """
 | |
|         return srsly.msgpack_dumps(self._tables)
 | |
| 
 | |
|     def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
 | |
|         """Load the lookups from a bytestring.
 | |
| 
 | |
|         exclude (list): String names of serialization fields to exclude.
 | |
|         RETURNS (bytes): The loaded Lookups.
 | |
|         """
 | |
|         self._tables = OrderedDict()
 | |
|         msg = srsly.msgpack_loads(bytes_data)
 | |
|         for key, value in msg.items():
 | |
|             self._tables[key] = Table.from_dict(value)
 | |
|         return self
 | |
| 
 | |
|     def to_disk(self, path, **kwargs):
 | |
|         """Save the lookups to a directory as lookups.bin.
 | |
| 
 | |
|         path (unicode / Path): The file path.
 | |
|         """
 | |
|         if len(self._tables):
 | |
|             path = ensure_path(path)
 | |
|             filepath = path / "lookups.bin"
 | |
|             with filepath.open("wb") as file_:
 | |
|                 file_.write(self.to_bytes())
 | |
| 
 | |
|     def from_disk(self, path, **kwargs):
 | |
|         """Load lookups from a directory containing a lookups.bin.
 | |
| 
 | |
|         path (unicode / Path): The file path.
 | |
|         RETURNS (Lookups): The loaded lookups.
 | |
|         """
 | |
|         path = ensure_path(path)
 | |
|         filepath = path / "lookups.bin"
 | |
|         if filepath.exists():
 | |
|             with filepath.open("rb") as file_:
 | |
|                 data = file_.read()
 | |
|             return self.from_bytes(data)
 | |
|         return self
 | |
| 
 | |
| 
 | |
| class Table(OrderedDict):
 | |
|     """A table in the lookups. Subclass of builtin dict that implements a
 | |
|     slightly more consistent and unified API.
 | |
|     """
 | |
|     @classmethod
 | |
|     def from_dict(cls, data, name=None):
 | |
|         self = cls(name=name)
 | |
|         self.update(data)
 | |
|         return self
 | |
| 
 | |
|     def __init__(self, name=None):
 | |
|         """Initialize a new table.
 | |
| 
 | |
|         name (unicode): Optional table name for reference.
 | |
|         RETURNS (Table): The newly created object.
 | |
|         """
 | |
|         OrderedDict.__init__(self)
 | |
|         self.name = name
 | |
| 
 | |
|     def set(self, key, value):
 | |
|         """Set new key/value pair. Same as table[key] = value."""
 | |
|         self[key] = value
 |