Add vector deduplication

* Add `Vocab.deduplicate_vectors()`
* Always run deduplication in `spacy init vectors`
* Clean up a few vector-related error messages and docs examples
* Always unique with numpy
* Fix types
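A minimal usage sketch of the new `Vocab.deduplicate_vectors()` call described above, assuming a loaded pipeline that ships with a static vectors table (the package name `en_core_web_md` is illustrative; `spacy init vectors` now runs the same deduplication automatically):

import spacy

# Load any pipeline that includes static word vectors (illustrative package name).
nlp = spacy.load("en_core_web_md")

rows_before = nlp.vocab.vectors.shape[0]
# Collapse identical rows in the vectors table and remap the affected keys.
nlp.vocab.deduplicate_vectors()
rows_after = nlp.vocab.vectors.shape[0]
print(f"vector rows: {rows_before} -> {rows_after}")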
from typing import Callable, Iterator, Optional, Union, List, Dict
from typing import Any, Iterable
from thinc.types import Floats1d, FloatsXd
from . import Language
from .strings import StringStore
from .lexeme import Lexeme
from .lookups import Lookups
from .morphology import Morphology
from .tokens import Doc, Span
from .vectors import Vectors
from pathlib import Path

def create_vocab(
    lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
) -> Vocab: ...

class Vocab:
    cfg: Dict[str, Any]
    get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]
    lookups: Lookups
    morphology: Morphology
    strings: StringStore
    vectors: Vectors
    writing_system: Dict[str, Any]
    def __init__(
        self,
        lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
        strings: Optional[Union[List[str], StringStore]] = ...,
        lookups: Optional[Lookups] = ...,
        oov_prob: float = ...,
        vectors_name: Optional[str] = ...,
        writing_system: Dict[str, Any] = ...,
        get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
    ) -> None: ...
    @property
    def lang(self) -> str: ...
    def __len__(self) -> int: ...
    def add_flag(
        self, flag_getter: Callable[[str], bool], flag_id: int = ...
    ) -> int: ...
    def __contains__(self, key: str) -> bool: ...
    def __iter__(self) -> Iterator[Lexeme]: ...
    def __getitem__(self, id_or_string: Union[str, int]) -> Lexeme: ...
    @property
    def vectors_length(self) -> int: ...
    def reset_vectors(
        self, *, width: Optional[int] = ..., shape: Optional[int] = ...
    ) -> None: ...
    def deduplicate_vectors(self) -> None: ...
    def prune_vectors(self, nr_row: int, batch_size: int = ...) -> Dict[str, float]: ...
    def get_vector(
        self,
        orth: Union[int, str],
        minn: Optional[int] = ...,
        maxn: Optional[int] = ...,
    ) -> FloatsXd: ...
    def set_vector(self, orth: Union[int, str], vector: Floats1d) -> None: ...
    def has_vector(self, orth: Union[int, str]) -> bool: ...
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> None: ...
    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> Vocab: ...
    def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
    def from_bytes(
        self, bytes_data: bytes, *, exclude: Iterable[str] = ...
    ) -> Vocab: ...

def pickle_vocab(vocab: Vocab) -> Any: ...
def unpickle_vocab(
    sstore: StringStore,
    vectors: Any,
    morphology: Any,
    _unused_object: Any,
    lex_attr_getters: Any,
    lookups: Any,
    get_noun_chunks: Any,
) -> Vocab: ...
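For context, a short hedged sketch of how the lookup, vector, and serialization methods typed in this stub are commonly used; the package name is illustrative and should be any pipeline that ships with static vectors:

import spacy
from spacy.vocab import Vocab

nlp = spacy.load("en_core_web_md")      # illustrative package with static vectors
vocab = nlp.vocab

print(len(vocab))                        # number of lexemes in the vocab
lexeme = vocab["apple"]                  # __getitem__ accepts a string or a hash
print(lexeme.text, vocab.has_vector("apple"))
print(vocab.get_vector("apple").shape)   # (vocab.vectors_length,)

# Round-trip through bytes: strings, lookups and vectors are serialized.
data = vocab.to_bytes()
vocab2 = Vocab().from_bytes(data)
print(vocab2.vectors.shape == vocab.vectors.shape)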