mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	Merge pull request #12842 from svlandeg/sync_v4
Sync v4 with latest from master and develop
This commit is contained in:
		
						commit
						eaaac5a08c
					
				
							
								
								
									
										6
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							|  | @ -45,6 +45,12 @@ jobs: | |||
|         run: | | ||||
|           python -m pip install flake8==5.0.4 | ||||
|           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics | ||||
|       - name: cython-lint | ||||
|         run: | | ||||
|           python -m pip install cython-lint -c requirements.txt | ||||
|           # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment | ||||
|           cython-lint spacy --ignore E501,W291,E266 | ||||
| 
 | ||||
|   tests: | ||||
|     name: Test | ||||
|     needs: Validate | ||||
|  |  | |||
							
								
								
									
										2
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								Makefile
									
									
									
									
									
								
							|  | @ -1,7 +1,7 @@ | |||
| SHELL := /bin/bash | ||||
| 
 | ||||
| ifndef SPACY_EXTRAS | ||||
| override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2 | ||||
| override SPACY_EXTRAS = spacy-lookups-data==1.0.3 | ||||
| endif | ||||
| 
 | ||||
| ifndef PYVER | ||||
|  |  | |||
|  | @ -36,4 +36,5 @@ types-setuptools>=57.0.0 | |||
| types-requests | ||||
| types-setuptools>=57.0.0 | ||||
| black==22.3.0 | ||||
| cython-lint>=0.15.0; python_version >= "3.7" | ||||
| isort>=5.0,<6.0 | ||||
|  |  | |||
|  | @ -32,6 +32,7 @@ def init_vectors_cli( | |||
|     mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), | ||||
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), | ||||
|     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), | ||||
|     attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Convert word vectors for use with spaCy. Will export an nlp object that | ||||
|  | @ -53,6 +54,7 @@ def init_vectors_cli( | |||
|         truncate=truncate, | ||||
|         prune=prune, | ||||
|         mode=mode, | ||||
|         attr=attr, | ||||
|     ) | ||||
|     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") | ||||
|     nlp.to_disk(output_dir) | ||||
|  |  | |||
|  | @ -128,7 +128,7 @@ grad_factor = 1.0 | |||
| {% if "span_finder" in components -%} | ||||
| [components.span_finder] | ||||
| factory = "span_finder" | ||||
| max_length = null | ||||
| max_length = 25 | ||||
| min_length = null | ||||
| scorer = {"@scorers":"spacy.span_finder_scorer.v1"} | ||||
| spans_key = "sc" | ||||
|  | @ -415,7 +415,7 @@ width = ${components.tok2vec.model.encode.width} | |||
| {% if "span_finder" in components %} | ||||
| [components.span_finder] | ||||
| factory = "span_finder" | ||||
| max_length = null | ||||
| max_length = 25 | ||||
| min_length = null | ||||
| scorer = {"@scorers":"spacy.span_finder_scorer.v1"} | ||||
| spans_key = "sc" | ||||
|  |  | |||
|  | @ -1,4 +1,3 @@ | |||
| import itertools | ||||
| import uuid | ||||
| from typing import Any, Dict, List, Optional, Tuple, Union | ||||
| 
 | ||||
|  | @ -218,7 +217,7 @@ class SpanRenderer: | |||
|                     + (self.offset_step * (len(entities) - 1)) | ||||
|                 ) | ||||
|                 markup += self.span_template.format( | ||||
|                     text=token["text"], | ||||
|                     text=escape_html(token["text"]), | ||||
|                     span_slices=slices, | ||||
|                     span_starts=starts, | ||||
|                     total_height=total_height, | ||||
|  |  | |||
|  | @ -208,6 +208,9 @@ class Warnings(metaclass=ErrorsWithCodes): | |||
|     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option " | ||||
|             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") | ||||
|     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") | ||||
|     W125 = ("The StaticVectors key_attr is no longer used. To set a custom " | ||||
|             "key attribute for vectors, configure it through Vectors(attr=) or " | ||||
|             "'spacy init vectors --attr'") | ||||
| 
 | ||||
|     # v4 warning strings | ||||
|     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") | ||||
|  |  | |||
|  | @ -12,8 +12,9 @@ from .candidate import Candidate | |||
| 
 | ||||
| 
 | ||||
| cdef class KnowledgeBase: | ||||
|     """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases, | ||||
|     to support entity linking of named entities to real-world concepts. | ||||
|     """A `KnowledgeBase` instance stores unique identifiers for entities and | ||||
|     their textual aliases, to support entity linking of named entities to | ||||
|     real-world concepts. | ||||
|     This is an abstract class and requires its operations to be implemented. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/kb | ||||
|  | @ -31,7 +32,9 @@ cdef class KnowledgeBase: | |||
|         self.entity_vector_length = entity_vector_length | ||||
|         self.mem = Pool() | ||||
| 
 | ||||
|     def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]: | ||||
|     def get_candidates_batch( | ||||
|             self, mentions: SpanGroup | ||||
|     ) -> Iterable[Iterable[Candidate]]: | ||||
|         """ | ||||
|         Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the | ||||
|         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior | ||||
|  | @ -52,7 +55,9 @@ cdef class KnowledgeBase: | |||
|         RETURNS (Iterable[Candidate]): Identified candidates. | ||||
|         """ | ||||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__) | ||||
|             Errors.E1045.format( | ||||
|                 parent="KnowledgeBase", method="get_candidates", name=self.__name__ | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]: | ||||
|  | @ -70,7 +75,9 @@ cdef class KnowledgeBase: | |||
|         RETURNS (Iterable[float]): Vector for specified entity. | ||||
|         """ | ||||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__) | ||||
|             Errors.E1045.format( | ||||
|                 parent="KnowledgeBase", method="get_vector", name=self.__name__ | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     def to_bytes(self, **kwargs) -> bytes: | ||||
|  | @ -78,7 +85,9 @@ cdef class KnowledgeBase: | |||
|         RETURNS (bytes): Current state as binary string. | ||||
|         """ | ||||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__) | ||||
|             Errors.E1045.format( | ||||
|                 parent="KnowledgeBase", method="to_bytes", name=self.__name__ | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()): | ||||
|  | @ -87,27 +96,37 @@ cdef class KnowledgeBase: | |||
|         exclude (Tuple[str]): Properties to exclude when restoring KB. | ||||
|         """ | ||||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__) | ||||
|             Errors.E1045.format( | ||||
|                 parent="KnowledgeBase", method="from_bytes", name=self.__name__ | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None: | ||||
|     def to_disk( | ||||
|             self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() | ||||
|     ) -> None: | ||||
|         """ | ||||
|         Write KnowledgeBase content to disk. | ||||
|         path (Union[str, Path]): Target file path. | ||||
|         exclude (Iterable[str]): List of components to exclude. | ||||
|         """ | ||||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__) | ||||
|             Errors.E1045.format( | ||||
|                 parent="KnowledgeBase", method="to_disk", name=self.__name__ | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None: | ||||
|     def from_disk( | ||||
|             self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() | ||||
|     ) -> None: | ||||
|         """ | ||||
|         Load KnowledgeBase content from disk. | ||||
|         path (Union[str, Path]): Target file path. | ||||
|         exclude (Iterable[str]): List of components to exclude. | ||||
|         """ | ||||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) | ||||
|             Errors.E1045.format( | ||||
|                 parent="KnowledgeBase", method="from_disk", name=self.__name__ | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     @property | ||||
|  |  | |||
|  | @ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|     # optional data, we can let users configure a DB as the backend for this. | ||||
|     cdef object _features_table | ||||
| 
 | ||||
| 
 | ||||
|     cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil: | ||||
|         """Add an entity vector to the vectors table.""" | ||||
|         cdef int64_t new_index = self._vectors_table.size() | ||||
|         self._vectors_table.push_back(entity_vector) | ||||
|         return new_index | ||||
| 
 | ||||
| 
 | ||||
|     cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq, | ||||
|                                      int32_t vector_index, int feats_row) nogil: | ||||
|     cdef inline int64_t c_add_entity( | ||||
|         self, | ||||
|         hash_t entity_hash, | ||||
|         float freq, | ||||
|         int32_t vector_index, | ||||
|         int feats_row | ||||
|     ) nogil: | ||||
|         """Add an entry to the vector of entries. | ||||
|         After calling this method, make sure to update also the _entry_index using the return value""" | ||||
|         After calling this method, make sure to update also the _entry_index | ||||
|         using the return value""" | ||||
|         # This is what we'll map the entity hash key to. It's where the entry will sit | ||||
|         # in the vector of entries, so we can get it later. | ||||
|         cdef int64_t new_index = self._entries.size() | ||||
| 
 | ||||
|         # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 | ||||
|         # Avoid struct initializer to enable nogil, cf. | ||||
|         # https://github.com/cython/cython/issues/1642 | ||||
|         cdef KBEntryC entry | ||||
|         entry.entity_hash = entity_hash | ||||
|         entry.vector_index = vector_index | ||||
|  | @ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         self._entries.push_back(entry) | ||||
|         return new_index | ||||
| 
 | ||||
|     cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil: | ||||
|         """Connect a mention to a list of potential entities with their prior probabilities . | ||||
|         After calling this method, make sure to update also the _alias_index using the return value""" | ||||
|         # This is what we'll map the alias hash key to. It's where the alias will be defined | ||||
|         # in the vector of aliases. | ||||
|     cdef inline int64_t c_add_aliases( | ||||
|         self, | ||||
|         hash_t alias_hash, | ||||
|         vector[int64_t] entry_indices, | ||||
|         vector[float] probs | ||||
|     ) nogil: | ||||
|         """Connect a mention to a list of potential entities with their prior | ||||
|         probabilities. After calling this method, make sure to update also the | ||||
|         _alias_index using the return value""" | ||||
|         # This is what we'll map the alias hash key to. It's where the alias will be | ||||
|         # defined in the vector of aliases. | ||||
|         cdef int64_t new_index = self._aliases_table.size() | ||||
| 
 | ||||
|         # Avoid struct initializer to enable nogil | ||||
|  | @ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
| 
 | ||||
|     cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: | ||||
|         """ | ||||
|         Initializing the vectors and making sure the first element of each vector is a dummy, | ||||
|         because the PreshMap maps pointing to indices in these vectors can not contain 0 as value | ||||
|         Initializing the vectors and making sure the first element of each vector is a | ||||
|         dummy, because the PreshMap maps pointing to indices in these vectors can not | ||||
|         contain 0 as value. | ||||
|         cf. https://github.com/explosion/preshed/issues/17 | ||||
|         """ | ||||
|         cdef int32_t dummy_value = 0 | ||||
|  | @ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
| cdef class Writer: | ||||
|     cdef FILE* _fp | ||||
| 
 | ||||
|     cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1 | ||||
|     cdef int write_header( | ||||
|         self, int64_t nr_entries, int64_t entity_vector_length | ||||
|     ) except -1 | ||||
|     cdef int write_vector_element(self, float element) except -1 | ||||
|     cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1 | ||||
|     cdef int write_entry( | ||||
|         self, hash_t entry_hash, float entry_freq, int32_t vector_index | ||||
|     ) except -1 | ||||
| 
 | ||||
|     cdef int write_alias_length(self, int64_t alias_length) except -1 | ||||
|     cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 | ||||
|     cdef int write_alias_header( | ||||
|         self, hash_t alias_hash, int64_t candidate_length | ||||
|     ) except -1 | ||||
|     cdef int write_alias(self, int64_t entry_index, float prob) except -1 | ||||
| 
 | ||||
|     cdef int _write(self, void* value, size_t size) except -1 | ||||
|  | @ -143,12 +161,18 @@ cdef class Writer: | |||
| cdef class Reader: | ||||
|     cdef FILE* _fp | ||||
| 
 | ||||
|     cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1 | ||||
|     cdef int read_header( | ||||
|         self, int64_t* nr_entries, int64_t* entity_vector_length | ||||
|     ) except -1 | ||||
|     cdef int read_vector_element(self, float* element) except -1 | ||||
|     cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1 | ||||
|     cdef int read_entry( | ||||
|         self, hash_t* entity_hash, float* freq, int32_t* vector_index | ||||
|     ) except -1 | ||||
| 
 | ||||
|     cdef int read_alias_length(self, int64_t* alias_length) except -1 | ||||
|     cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 | ||||
|     cdef int read_alias_header( | ||||
|         self, hash_t* alias_hash, int64_t* candidate_length | ||||
|     ) except -1 | ||||
|     cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 | ||||
| 
 | ||||
|     cdef int _read(self, void* value, size_t size) except -1 | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| from typing import Any, Callable, Dict, Iterable, Union | ||||
| from typing import Any, Callable, Dict, Iterable | ||||
| 
 | ||||
| import srsly | ||||
| 
 | ||||
|  | @ -27,8 +27,9 @@ from .candidate import InMemoryCandidate | |||
| 
 | ||||
| 
 | ||||
| cdef class InMemoryLookupKB(KnowledgeBase): | ||||
|     """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases, | ||||
|     to support entity linking of named entities to real-world concepts. | ||||
|     """An `InMemoryLookupKB` instance stores unique identifiers for entities | ||||
|     and their textual aliases, to support entity linking of named entities to | ||||
|     real-world concepts. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/inmemorylookupkb | ||||
|     """ | ||||
|  | @ -71,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
| 
 | ||||
|     def add_entity(self, str entity, float freq, vector[float] entity_vector): | ||||
|         """ | ||||
|         Add an entity to the KB, optionally specifying its log probability based on corpus frequency | ||||
|         Add an entity to the KB, optionally specifying its log probability | ||||
|         based on corpus frequency. | ||||
|         Return the hash of the entity ID/name at the end. | ||||
|         """ | ||||
|         cdef hash_t entity_hash = self.vocab.strings.add(entity) | ||||
|  | @ -83,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
| 
 | ||||
|         # Raise an error if the provided entity vector is not of the correct length | ||||
|         if len(entity_vector) != self.entity_vector_length: | ||||
|             raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) | ||||
|             raise ValueError( | ||||
|                 Errors.E141.format( | ||||
|                     found=len(entity_vector), required=self.entity_vector_length | ||||
|                 ) | ||||
|             ) | ||||
| 
 | ||||
|         vector_index = self.c_add_vector(entity_vector=entity_vector) | ||||
| 
 | ||||
|         new_index = self.c_add_entity(entity_hash=entity_hash, | ||||
|                                       freq=freq, | ||||
|                                       vector_index=vector_index, | ||||
|                                       feats_row=-1)  # Features table currently not implemented | ||||
|         new_index = self.c_add_entity( | ||||
|             entity_hash=entity_hash, | ||||
|             freq=freq, | ||||
|             vector_index=vector_index, | ||||
|             feats_row=-1 | ||||
|         )  # Features table currently not implemented | ||||
|         self._entry_index[entity_hash] = new_index | ||||
| 
 | ||||
|         return entity_hash | ||||
|  | @ -115,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|             else: | ||||
|                 entity_vector = vector_list[i] | ||||
|                 if len(entity_vector) != self.entity_vector_length: | ||||
|                     raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) | ||||
|                     raise ValueError( | ||||
|                         Errors.E141.format( | ||||
|                             found=len(entity_vector), | ||||
|                             required=self.entity_vector_length | ||||
|                         ) | ||||
|                     ) | ||||
| 
 | ||||
|                 entry.entity_hash = entity_hash | ||||
|                 entry.freq = freq_list[i] | ||||
|  | @ -149,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         previous_alias_nr = self.get_size_aliases() | ||||
|         # Throw an error if the length of entities and probabilities are not the same | ||||
|         if not len(entities) == len(probabilities): | ||||
|             raise ValueError(Errors.E132.format(alias=alias, | ||||
|                                                 entities_length=len(entities), | ||||
|                                                 probabilities_length=len(probabilities))) | ||||
|             raise ValueError( | ||||
|                 Errors.E132.format( | ||||
|                     alias=alias, | ||||
|                     entities_length=len(entities), | ||||
|                     probabilities_length=len(probabilities)) | ||||
|             ) | ||||
| 
 | ||||
|         # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors) | ||||
|         # Throw an error if the probabilities sum up to more than 1 (allow for | ||||
|         # some rounding errors) | ||||
|         prob_sum = sum(probabilities) | ||||
|         if prob_sum > 1.00001: | ||||
|             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum)) | ||||
|  | @ -170,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
| 
 | ||||
|         for entity, prob in zip(entities, probabilities): | ||||
|             entity_hash = self.vocab.strings[entity] | ||||
|             if not entity_hash in self._entry_index: | ||||
|             if entity_hash not in self._entry_index: | ||||
|                 raise ValueError(Errors.E134.format(entity=entity)) | ||||
| 
 | ||||
|             entry_index = <int64_t>self._entry_index.get(entity_hash) | ||||
|             entry_indices.push_back(int(entry_index)) | ||||
|             probs.push_back(float(prob)) | ||||
| 
 | ||||
|         new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) | ||||
|         new_index = self.c_add_aliases( | ||||
|             alias_hash=alias_hash, entry_indices=entry_indices, probs=probs | ||||
|         ) | ||||
|         self._alias_index[alias_hash] = new_index | ||||
| 
 | ||||
|         if previous_alias_nr + 1 != self.get_size_aliases(): | ||||
|             raise RuntimeError(Errors.E891.format(alias=alias)) | ||||
|         return alias_hash | ||||
| 
 | ||||
|     def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False): | ||||
|     def append_alias( | ||||
|         self, str alias, str entity, float prior_prob, ignore_warnings=False | ||||
|     ): | ||||
|         """ | ||||
|         For an alias already existing in the KB, extend its potential entities with one more. | ||||
|         For an alias already existing in the KB, extend its potential entities | ||||
|         with one more. | ||||
|         Throw a warning if either the alias or the entity is unknown, | ||||
|         or when the combination is already previously recorded. | ||||
|         Throw an error if this entity+prior prob would exceed the sum of 1. | ||||
|         For efficiency, it's best to use the method `add_alias` as much as possible instead of this one. | ||||
|         For efficiency, it's best to use the method `add_alias` as much as | ||||
|         possible instead of this one. | ||||
|         """ | ||||
|         # Check if the alias exists in the KB | ||||
|         cdef hash_t alias_hash = self.vocab.strings[alias] | ||||
|         if not alias_hash in self._alias_index: | ||||
|         if alias_hash not in self._alias_index: | ||||
|             raise ValueError(Errors.E176.format(alias=alias)) | ||||
| 
 | ||||
|         # Check if the entity exists in the KB | ||||
|         cdef hash_t entity_hash = self.vocab.strings[entity] | ||||
|         if not entity_hash in self._entry_index: | ||||
|         if entity_hash not in self._entry_index: | ||||
|             raise ValueError(Errors.E134.format(entity=entity)) | ||||
|         entry_index = <int64_t>self._entry_index.get(entity_hash) | ||||
| 
 | ||||
|         # Throw an error if the prior probabilities (including the new one) sum up to more than 1 | ||||
|         # Throw an error if the prior probabilities (including the new one) | ||||
|         # sum up to more than 1 | ||||
|         alias_index = <int64_t>self._alias_index.get(alias_hash) | ||||
|         alias_entry = self._aliases_table[alias_index] | ||||
|         current_sum = sum([p for p in alias_entry.probs]) | ||||
|  | @ -236,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
| 
 | ||||
|     def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: | ||||
|         """ | ||||
|         Return candidate entities for an alias. Each candidate defines the entity, the original alias, | ||||
|         and the prior probability of that alias resolving to that entity. | ||||
|         Return candidate entities for an alias. Each candidate defines the | ||||
|         entity, the original alias, and the prior probability of that alias | ||||
|         resolving to that entity. | ||||
|         If the alias is not known in the KB, an empty list is returned. | ||||
|         """ | ||||
|         cdef hash_t alias_hash = self.vocab.strings[alias] | ||||
|         if not alias_hash in self._alias_index: | ||||
|         if alias_hash not in self._alias_index: | ||||
|             return [] | ||||
|         alias_index = <int64_t>self._alias_index.get(alias_hash) | ||||
|         alias_entry = self._aliases_table[alias_index] | ||||
|  | @ -270,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         return self._vectors_table[self._entries[entry_index].vector_index] | ||||
| 
 | ||||
|     def get_prior_prob(self, str entity, str alias): | ||||
|         """ Return the prior probability of a given alias being linked to a given entity, | ||||
|         or return 0.0 when this combination is not known in the knowledge base""" | ||||
|         """ Return the prior probability of a given alias being linked to a | ||||
|         given entity, or return 0.0 when this combination is not known in the | ||||
|         knowledge base.""" | ||||
|         cdef hash_t alias_hash = self.vocab.strings[alias] | ||||
|         cdef hash_t entity_hash = self.vocab.strings[entity] | ||||
| 
 | ||||
|  | @ -282,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         entry_index = self._entry_index[entity_hash] | ||||
| 
 | ||||
|         alias_entry = self._aliases_table[alias_index] | ||||
|         for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs): | ||||
|         for (entry_index, prior_prob) in zip( | ||||
|             alias_entry.entry_indices, alias_entry.probs | ||||
|         ): | ||||
|             if self._entries[entry_index].entity_hash == entity_hash: | ||||
|                 return prior_prob | ||||
| 
 | ||||
|  | @ -295,13 +323,19 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         """Serialize the current state to a binary string. | ||||
|         """ | ||||
|         def serialize_header(): | ||||
|             header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length) | ||||
|             header = ( | ||||
|                 self.get_size_entities(), | ||||
|                 self.get_size_aliases(), | ||||
|                 self.entity_vector_length | ||||
|             ) | ||||
|             return srsly.json_dumps(header) | ||||
| 
 | ||||
|         def serialize_entries(): | ||||
|             i = 1 | ||||
|             tuples = [] | ||||
|             for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): | ||||
|             for entry_hash, entry_index in sorted( | ||||
|                 self._entry_index.items(), key=lambda x: x[1] | ||||
|             ): | ||||
|                 entry = self._entries[entry_index] | ||||
|                 assert entry.entity_hash == entry_hash | ||||
|                 assert entry_index == i | ||||
|  | @ -314,7 +348,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|             headers = [] | ||||
|             indices_lists = [] | ||||
|             probs_lists = [] | ||||
|             for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): | ||||
|             for alias_hash, alias_index in sorted( | ||||
|                 self._alias_index.items(), key=lambda x: x[1] | ||||
|             ): | ||||
|                 alias = self._aliases_table[alias_index] | ||||
|                 assert alias_index == i | ||||
|                 candidate_length = len(alias.entry_indices) | ||||
|  | @ -372,7 +408,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|             indices = srsly.json_loads(all_data[1]) | ||||
|             probs = srsly.json_loads(all_data[2]) | ||||
|             for header, indices, probs in zip(headers, indices, probs): | ||||
|                 alias_hash, candidate_length = header | ||||
|                 alias_hash, _candidate_length = header | ||||
|                 alias.entry_indices = indices | ||||
|                 alias.probs = probs | ||||
|                 self._aliases_table[i] = alias | ||||
|  | @ -421,10 +457,14 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|                 writer.write_vector_element(element) | ||||
|             i = i+1 | ||||
| 
 | ||||
|         # dumping the entry records in the order in which they are in the _entries vector. | ||||
|         # index 0 is a dummy object not stored in the _entry_index and can be ignored. | ||||
|         # dumping the entry records in the order in which they are in the | ||||
|         # _entries vector. | ||||
|         # index 0 is a dummy object not stored in the _entry_index and can | ||||
|         # be ignored. | ||||
|         i = 1 | ||||
|         for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): | ||||
|         for entry_hash, entry_index in sorted( | ||||
|             self._entry_index.items(), key=lambda x: x[1] | ||||
|         ): | ||||
|             entry = self._entries[entry_index] | ||||
|             assert entry.entity_hash == entry_hash | ||||
|             assert entry_index == i | ||||
|  | @ -436,7 +476,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         # dumping the aliases in the order in which they are in the _alias_index vector. | ||||
|         # index 0 is a dummy object not stored in the _aliases_table and can be ignored. | ||||
|         i = 1 | ||||
|         for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): | ||||
|         for alias_hash, alias_index in sorted( | ||||
|                 self._alias_index.items(), key=lambda x: x[1] | ||||
|         ): | ||||
|             alias = self._aliases_table[alias_index] | ||||
|             assert alias_index == i | ||||
| 
 | ||||
|  | @ -542,7 +584,8 @@ cdef class Writer: | |||
|     def __init__(self, path): | ||||
|         assert isinstance(path, Path) | ||||
|         content = bytes(path) | ||||
|         cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content | ||||
|         cdef bytes bytes_loc = content.encode('utf8') \ | ||||
|             if type(content) == str else content | ||||
|         self._fp = fopen(<char*>bytes_loc, 'wb') | ||||
|         if not self._fp: | ||||
|             raise IOError(Errors.E146.format(path=path)) | ||||
|  | @ -552,14 +595,18 @@ cdef class Writer: | |||
|         cdef size_t status = fclose(self._fp) | ||||
|         assert status == 0 | ||||
| 
 | ||||
|     cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1: | ||||
|     cdef int write_header( | ||||
|         self, int64_t nr_entries, int64_t entity_vector_length | ||||
|     ) except -1: | ||||
|         self._write(&nr_entries, sizeof(nr_entries)) | ||||
|         self._write(&entity_vector_length, sizeof(entity_vector_length)) | ||||
| 
 | ||||
|     cdef int write_vector_element(self, float element) except -1: | ||||
|         self._write(&element, sizeof(element)) | ||||
| 
 | ||||
|     cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1: | ||||
|     cdef int write_entry( | ||||
|         self, hash_t entry_hash, float entry_freq, int32_t vector_index | ||||
|     ) except -1: | ||||
|         self._write(&entry_hash, sizeof(entry_hash)) | ||||
|         self._write(&entry_freq, sizeof(entry_freq)) | ||||
|         self._write(&vector_index, sizeof(vector_index)) | ||||
|  | @ -568,7 +615,9 @@ cdef class Writer: | |||
|     cdef int write_alias_length(self, int64_t alias_length) except -1: | ||||
|         self._write(&alias_length, sizeof(alias_length)) | ||||
| 
 | ||||
|     cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1: | ||||
|     cdef int write_alias_header( | ||||
|         self, hash_t alias_hash, int64_t candidate_length | ||||
|     ) except -1: | ||||
|         self._write(&alias_hash, sizeof(alias_hash)) | ||||
|         self._write(&candidate_length, sizeof(candidate_length)) | ||||
| 
 | ||||
|  | @ -584,16 +633,19 @@ cdef class Writer: | |||
| cdef class Reader: | ||||
|     def __init__(self, path): | ||||
|         content = bytes(path) | ||||
|         cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content | ||||
|         cdef bytes bytes_loc = content.encode('utf8') \ | ||||
|             if type(content) == str else content | ||||
|         self._fp = fopen(<char*>bytes_loc, 'rb') | ||||
|         if not self._fp: | ||||
|             PyErr_SetFromErrno(IOError) | ||||
|         status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header | ||||
|         fseek(self._fp, 0, 0)  # this can be 0 if there is no header | ||||
| 
 | ||||
|     def __dealloc__(self): | ||||
|         fclose(self._fp) | ||||
| 
 | ||||
|     cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1: | ||||
|     cdef int read_header( | ||||
|         self, int64_t* nr_entries, int64_t* entity_vector_length | ||||
|     ) except -1: | ||||
|         status = self._read(nr_entries, sizeof(int64_t)) | ||||
|         if status < 1: | ||||
|             if feof(self._fp): | ||||
|  | @ -613,7 +665,9 @@ cdef class Reader: | |||
|                 return 0  # end of file | ||||
|             raise IOError(Errors.E145.format(param="vector element")) | ||||
| 
 | ||||
|     cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1: | ||||
|     cdef int read_entry( | ||||
|         self, hash_t* entity_hash, float* freq, int32_t* vector_index | ||||
|     ) except -1: | ||||
|         status = self._read(entity_hash, sizeof(hash_t)) | ||||
|         if status < 1: | ||||
|             if feof(self._fp): | ||||
|  | @ -644,7 +698,9 @@ cdef class Reader: | |||
|                 return 0  # end of file | ||||
|             raise IOError(Errors.E145.format(param="alias length")) | ||||
| 
 | ||||
|     cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1: | ||||
|     cdef int read_alias_header( | ||||
|         self, hash_t* alias_hash, int64_t* candidate_length | ||||
|     ) except -1: | ||||
|         status = self._read(alias_hash, sizeof(hash_t)) | ||||
|         if status < 1: | ||||
|             if feof(self._fp): | ||||
|  |  | |||
|  | @ -740,6 +740,11 @@ class Language: | |||
|                 ) | ||||
|             ) | ||||
|         pipe = source.get_pipe(source_name) | ||||
|         # There is no actual solution here. Either the component has the right | ||||
|         # name for the source pipeline or the component has the right name for | ||||
|         # the current pipeline. This prioritizes the current pipeline. | ||||
|         if hasattr(pipe, "name"): | ||||
|             pipe.name = name | ||||
|         # Make sure the source config is interpolated so we don't end up with | ||||
|         # orphaned variables in our final config | ||||
|         source_config = source.config.interpolate() | ||||
|  | @ -817,6 +822,7 @@ class Language: | |||
|         pipe_index = self._get_pipe_index(before, after, first, last) | ||||
|         self._pipe_meta[name] = self.get_factory_meta(factory_name) | ||||
|         self._components.insert(pipe_index, (name, pipe_component)) | ||||
|         self._link_components() | ||||
|         return pipe_component | ||||
| 
 | ||||
|     def _get_pipe_index( | ||||
|  | @ -956,6 +962,7 @@ class Language: | |||
|         if old_name in self._config["initialize"]["components"]: | ||||
|             init_cfg = self._config["initialize"]["components"].pop(old_name) | ||||
|             self._config["initialize"]["components"][new_name] = init_cfg | ||||
|         self._link_components() | ||||
| 
 | ||||
|     def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]: | ||||
|         """Remove a component from the pipeline. | ||||
|  | @ -979,6 +986,7 @@ class Language: | |||
|         # Make sure the name is also removed from the set of disabled components | ||||
|         if name in self.disabled: | ||||
|             self._disabled.remove(name) | ||||
|         self._link_components() | ||||
|         return removed | ||||
| 
 | ||||
|     def disable_pipe(self, name: str) -> None: | ||||
|  | @ -1823,8 +1831,16 @@ class Language: | |||
|         # The problem is we need to do it during deserialization...And the | ||||
|         # components don't receive the pipeline then. So this does have to be | ||||
|         # here :( | ||||
|         # First, fix up all the internal component names in case they have | ||||
|         # gotten out of sync due to sourcing components from different | ||||
|         # pipelines, since find_listeners uses proc2.name for the listener | ||||
|         # map. | ||||
|         for name, proc in self.pipeline: | ||||
|             if hasattr(proc, "name"): | ||||
|                 proc.name = name | ||||
|         for i, (name1, proc1) in enumerate(self.pipeline): | ||||
|             if isinstance(proc1, ty.ListenedToComponent): | ||||
|                 proc1.listener_map = {} | ||||
|                 for name2, proc2 in self.pipeline[i + 1 :]: | ||||
|                     proc1.find_listeners(proc2) | ||||
| 
 | ||||
|  | @ -1934,7 +1950,6 @@ class Language: | |||
|         # Later we replace the component config with the raw config again. | ||||
|         interpolated = filled.interpolate() if not filled.is_interpolated else filled | ||||
|         pipeline = interpolated.get("components", {}) | ||||
|         sourced = util.get_sourced_components(interpolated) | ||||
|         # If components are loaded from a source (existing models), we cache | ||||
|         # them here so they're only loaded once | ||||
|         source_nlps = {} | ||||
|  | @ -1962,6 +1977,7 @@ class Language: | |||
|                         raw_config=raw_config, | ||||
|                     ) | ||||
|                 else: | ||||
|                     assert "source" in pipe_cfg | ||||
|                     # We need the sourced components to reference the same | ||||
|                     # vocab without modifying the current vocab state **AND** | ||||
|                     # we still want to load the source model vectors to perform | ||||
|  | @ -1981,6 +1997,10 @@ class Language: | |||
|                     source_name = pipe_cfg.get("component", pipe_name) | ||||
|                     listeners_replaced = False | ||||
|                     if "replace_listeners" in pipe_cfg: | ||||
|                         # Make sure that the listened-to component has the | ||||
|                         # state of the source pipeline listener map so that the | ||||
|                         # replace_listeners method below works as intended. | ||||
|                         source_nlps[model]._link_components() | ||||
|                         for name, proc in source_nlps[model].pipeline: | ||||
|                             if source_name in getattr(proc, "listening_components", []): | ||||
|                                 source_nlps[model].replace_listeners( | ||||
|  | @ -1992,6 +2012,8 @@ class Language: | |||
|                         nlp.add_pipe( | ||||
|                             source_name, source=source_nlps[model], name=pipe_name | ||||
|                         ) | ||||
|                         # At this point after nlp.add_pipe, the listener map | ||||
|                         # corresponds to the new pipeline. | ||||
|                     if model not in source_nlp_vectors_hashes: | ||||
|                         source_nlp_vectors_hashes[model] = hash( | ||||
|                             source_nlps[model].vocab.vectors.to_bytes( | ||||
|  | @ -2046,27 +2068,6 @@ class Language: | |||
|                 raise ValueError( | ||||
|                     Errors.E942.format(name="pipeline_creation", value=type(nlp)) | ||||
|                 ) | ||||
|         # Detect components with listeners that are not frozen consistently | ||||
|         for name, proc in nlp.pipeline: | ||||
|             if isinstance(proc, ty.ListenedToComponent): | ||||
|                 # Remove listeners not in the pipeline | ||||
|                 listener_names = proc.listening_components | ||||
|                 unused_listener_names = [ | ||||
|                     ll for ll in listener_names if ll not in nlp.pipe_names | ||||
|                 ] | ||||
|                 for listener_name in unused_listener_names: | ||||
|                     for listener in proc.listener_map.get(listener_name, []): | ||||
|                         proc.remove_listener(listener, listener_name) | ||||
| 
 | ||||
|                 for listener_name in proc.listening_components: | ||||
|                     # e.g. tok2vec/transformer | ||||
|                     # If it's a component sourced from another pipeline, we check if | ||||
|                     # the tok2vec listeners should be replaced with standalone tok2vec | ||||
|                     # models (e.g. so component can be frozen without its performance | ||||
|                     # degrading when other components/tok2vec are updated) | ||||
|                     paths = sourced.get(listener_name, {}).get("replace_listeners", []) | ||||
|                     if paths: | ||||
|                         nlp.replace_listeners(name, listener_name, paths) | ||||
|         return nlp | ||||
| 
 | ||||
|     def replace_listeners( | ||||
|  | @ -2081,7 +2082,7 @@ class Language: | |||
|         useful when training a pipeline with components sourced from an existing | ||||
|         pipeline: if multiple components (e.g. tagger, parser, NER) listen to | ||||
|         the same tok2vec component, but some of them are frozen and not updated, | ||||
|         their performance may degrade significally as the tok2vec component is | ||||
|         their performance may degrade significantly as the tok2vec component is | ||||
|         updated with new data. To prevent this, listeners can be replaced with | ||||
|         a standalone tok2vec layer that is owned by the component and doesn't | ||||
|         change if the component isn't updated. | ||||
|  |  | |||
|  | @ -1,7 +1,6 @@ | |||
| # cython: embedsignature=True | ||||
| # Compiler crashes on memory view coercion without this. Should report bug. | ||||
| cimport numpy as np | ||||
| from cython.view cimport array as cvarray | ||||
| from libc.string cimport memset | ||||
| 
 | ||||
| np.import_array() | ||||
|  | @ -35,7 +34,7 @@ from .typedefs cimport attr_t, flags_t | |||
| from .attrs import intify_attrs | ||||
| from .errors import Errors, Warnings | ||||
| 
 | ||||
| OOV_RANK = 0xffffffffffffffff # UINT64_MAX | ||||
| OOV_RANK = 0xffffffffffffffff  # UINT64_MAX | ||||
| memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | ||||
| EMPTY_LEXEME.id = OOV_RANK | ||||
| 
 | ||||
|  | @ -105,7 +104,7 @@ cdef class Lexeme: | |||
|             if isinstance(value, float): | ||||
|                 continue | ||||
|             elif isinstance(value, (int, long)): | ||||
|                  Lexeme.set_struct_attr(self.c, attr, value) | ||||
|                 Lexeme.set_struct_attr(self.c, attr, value) | ||||
|             else: | ||||
|                 Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value)) | ||||
| 
 | ||||
|  | @ -137,10 +136,12 @@ cdef class Lexeme: | |||
|         if hasattr(other, "orth"): | ||||
|             if self.c.orth == other.orth: | ||||
|                 return 1.0 | ||||
|         elif hasattr(other, "__len__") and len(other) == 1 \ | ||||
|         and hasattr(other[0], "orth"): | ||||
|             if self.c.orth == other[0].orth: | ||||
|                 return 1.0 | ||||
|         elif ( | ||||
|             hasattr(other, "__len__") and len(other) == 1 | ||||
|             and hasattr(other[0], "orth") | ||||
|             and self.c.orth == other[0].orth | ||||
|         ): | ||||
|             return 1.0 | ||||
|         if self.vector_norm == 0 or other.vector_norm == 0: | ||||
|             warnings.warn(Warnings.W008.format(obj="Lexeme")) | ||||
|             return 0.0 | ||||
|  | @ -149,7 +150,7 @@ cdef class Lexeme: | |||
|         result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) | ||||
|         # ensure we get a scalar back (numpy does this automatically but cupy doesn't) | ||||
|         return result.item() | ||||
|      | ||||
| 
 | ||||
|     @property | ||||
|     def has_vector(self): | ||||
|         """RETURNS (bool): Whether a word vector is associated with the object. | ||||
|  |  | |||
|  | @ -108,7 +108,7 @@ cdef class DependencyMatcher: | |||
|         key (str): The match ID. | ||||
|         RETURNS (bool): Whether the matcher contains rules for this match ID. | ||||
|         """ | ||||
|         return self.has_key(key) | ||||
|         return self.has_key(key)  # no-cython-lint: W601 | ||||
| 
 | ||||
|     def _validate_input(self, pattern, key): | ||||
|         idx = 0 | ||||
|  | @ -264,7 +264,7 @@ cdef class DependencyMatcher: | |||
| 
 | ||||
|     def remove(self, key): | ||||
|         key = self._normalize_key(key) | ||||
|         if not key in self._patterns: | ||||
|         if key not in self._patterns: | ||||
|             raise ValueError(Errors.E175.format(key=key)) | ||||
|         self._patterns.pop(key) | ||||
|         self._raw_patterns.pop(key) | ||||
|  | @ -382,7 +382,7 @@ cdef class DependencyMatcher: | |||
|             return [] | ||||
|         return [doc[node].head] | ||||
| 
 | ||||
|     def _gov(self,doc,node): | ||||
|     def _gov(self, doc, node): | ||||
|         return list(doc[node].children) | ||||
| 
 | ||||
|     def _dep_chain(self, doc, node): | ||||
|  | @ -443,7 +443,7 @@ cdef class DependencyMatcher: | |||
| 
 | ||||
|     def _right_child(self, doc, node): | ||||
|         return [child for child in doc[node].rights] | ||||
|      | ||||
| 
 | ||||
|     def _left_child(self, doc, node): | ||||
|         return [child for child in doc[node].lefts] | ||||
| 
 | ||||
|  | @ -461,7 +461,7 @@ cdef class DependencyMatcher: | |||
|         if doc[node].head.i > node: | ||||
|             return [doc[node].head] | ||||
|         return [] | ||||
|      | ||||
| 
 | ||||
|     def _left_parent(self, doc, node): | ||||
|         if doc[node].head.i < node: | ||||
|             return [doc[node].head] | ||||
|  |  | |||
|  | @ -12,25 +12,13 @@ import warnings | |||
| 
 | ||||
| import srsly | ||||
| 
 | ||||
| from ..attrs cimport ( | ||||
|     DEP, | ||||
|     ENT_IOB, | ||||
|     ID, | ||||
|     LEMMA, | ||||
|     MORPH, | ||||
|     NULL_ATTR, | ||||
|     ORTH, | ||||
|     POS, | ||||
|     TAG, | ||||
|     attr_id_t, | ||||
| ) | ||||
| from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG | ||||
| from ..structs cimport TokenC | ||||
| from ..tokens.doc cimport Doc, get_token_attr_for_matcher | ||||
| from ..tokens.morphanalysis cimport MorphAnalysis | ||||
| from ..tokens.span cimport Span | ||||
| from ..tokens.token cimport Token | ||||
| from ..typedefs cimport attr_t | ||||
| from ..vocab cimport Vocab | ||||
| 
 | ||||
| from ..errors import Errors, MatchPatternError, Warnings | ||||
| from ..schemas import validate_token_pattern | ||||
|  | @ -42,7 +30,6 @@ from ..attrs import IDS | |||
| from ..errors import Errors, MatchPatternError, Warnings | ||||
| from ..schemas import validate_token_pattern | ||||
| from ..strings import get_string_id | ||||
| from ..util import registry | ||||
| from .levenshtein import levenshtein_compare | ||||
| 
 | ||||
| DEF PADDING = 5 | ||||
|  | @ -93,9 +80,9 @@ cdef class Matcher: | |||
|         key (str): The match ID. | ||||
|         RETURNS (bool): Whether the matcher contains rules for this match ID. | ||||
|         """ | ||||
|         return self.has_key(key) | ||||
|         return self.has_key(key)  # no-cython-lint: W601 | ||||
| 
 | ||||
|     def add(self, key, patterns, *, on_match=None, greedy: str=None): | ||||
|     def add(self, key, patterns, *, on_match=None, greedy: str = None): | ||||
|         """Add a match-rule to the matcher. A match-rule consists of: an ID | ||||
|         key, an on_match callback, and one or more patterns. | ||||
| 
 | ||||
|  | @ -149,8 +136,13 @@ cdef class Matcher: | |||
|         key = self._normalize_key(key) | ||||
|         for pattern in patterns: | ||||
|             try: | ||||
|                 specs = _preprocess_pattern(pattern, self.vocab, | ||||
|                     self._extensions, self._extra_predicates, self._fuzzy_compare) | ||||
|                 specs = _preprocess_pattern( | ||||
|                     pattern, | ||||
|                     self.vocab, | ||||
|                     self._extensions, | ||||
|                     self._extra_predicates, | ||||
|                     self._fuzzy_compare | ||||
|                 ) | ||||
|                 self.patterns.push_back(init_pattern(self.mem, key, specs)) | ||||
|                 for spec in specs: | ||||
|                     for attr, _ in spec[1]: | ||||
|  | @ -174,7 +166,7 @@ cdef class Matcher: | |||
|         key (str): The ID of the match rule. | ||||
|         """ | ||||
|         norm_key = self._normalize_key(key) | ||||
|         if not norm_key in self._patterns: | ||||
|         if norm_key not in self._patterns: | ||||
|             raise ValueError(Errors.E175.format(key=key)) | ||||
|         self._patterns.pop(norm_key) | ||||
|         self._callbacks.pop(norm_key) | ||||
|  | @ -274,8 +266,15 @@ cdef class Matcher: | |||
|         if self.patterns.empty(): | ||||
|             matches = [] | ||||
|         else: | ||||
|             matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, | ||||
|                                     extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments) | ||||
|             matches = find_matches( | ||||
|                 &self.patterns[0], | ||||
|                 self.patterns.size(), | ||||
|                 doclike, | ||||
|                 length, | ||||
|                 extensions=self._extensions, | ||||
|                 predicates=self._extra_predicates, | ||||
|                 with_alignments=with_alignments | ||||
|             ) | ||||
|         final_matches = [] | ||||
|         pairs_by_id = {} | ||||
|         # For each key, either add all matches, or only the filtered, | ||||
|  | @ -299,9 +298,9 @@ cdef class Matcher: | |||
|             memset(matched, 0, length * sizeof(matched[0])) | ||||
|             span_filter = self._filter.get(key) | ||||
|             if span_filter == "FIRST": | ||||
|                 sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start | ||||
|                 sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False)  # sort by start | ||||
|             elif span_filter == "LONGEST": | ||||
|                 sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length | ||||
|                 sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True)  # reverse sort by length | ||||
|             else: | ||||
|                 raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter)) | ||||
|             for match in sorted_pairs: | ||||
|  | @ -373,7 +372,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e | |||
|     cdef vector[MatchC] matches | ||||
|     cdef vector[vector[MatchAlignmentC]] align_states | ||||
|     cdef vector[vector[MatchAlignmentC]] align_matches | ||||
|     cdef PatternStateC state | ||||
|     cdef int i, j, nr_extra_attr | ||||
|     cdef Pool mem = Pool() | ||||
|     output = [] | ||||
|  | @ -395,14 +393,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e | |||
|                 value = token.vocab.strings[value] | ||||
|             extra_attr_values[i * nr_extra_attr + index] = value | ||||
|     # Main loop | ||||
|     cdef int nr_predicate = len(predicates) | ||||
|     for i in range(length): | ||||
|         for j in range(n): | ||||
|             states.push_back(PatternStateC(patterns[j], i, 0)) | ||||
|         if with_alignments != 0: | ||||
|             align_states.resize(states.size()) | ||||
|         transition_states(states, matches, align_states, align_matches, predicate_cache, | ||||
|             doclike[i], extra_attr_values, predicates, with_alignments) | ||||
|         transition_states( | ||||
|             states, | ||||
|             matches, | ||||
|             align_states, | ||||
|             align_matches, | ||||
|             predicate_cache, | ||||
|             doclike[i], | ||||
|             extra_attr_values, | ||||
|             predicates, | ||||
|             with_alignments | ||||
|         ) | ||||
|         extra_attr_values += nr_extra_attr | ||||
|         predicate_cache += len(predicates) | ||||
|     # Handle matches that end in 0-width patterns | ||||
|  | @ -428,18 +434,28 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e | |||
|     return output | ||||
| 
 | ||||
| 
 | ||||
| cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, | ||||
|                             vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, | ||||
|                             int8_t* cached_py_predicates, | ||||
|         Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *: | ||||
| cdef void transition_states( | ||||
|     vector[PatternStateC]& states, | ||||
|     vector[MatchC]& matches, | ||||
|     vector[vector[MatchAlignmentC]]& align_states, | ||||
|     vector[vector[MatchAlignmentC]]& align_matches, | ||||
|     int8_t* cached_py_predicates, | ||||
|     Token token, | ||||
|     const attr_t* extra_attrs, | ||||
|     py_predicates, | ||||
|     bint with_alignments | ||||
| ) except *: | ||||
|     cdef int q = 0 | ||||
|     cdef vector[PatternStateC] new_states | ||||
|     cdef vector[vector[MatchAlignmentC]] align_new_states | ||||
|     cdef int nr_predicate = len(py_predicates) | ||||
|     for i in range(states.size()): | ||||
|         if states[i].pattern.nr_py >= 1: | ||||
|             update_predicate_cache(cached_py_predicates, | ||||
|                 states[i].pattern, token, py_predicates) | ||||
|             update_predicate_cache( | ||||
|                 cached_py_predicates, | ||||
|                 states[i].pattern, | ||||
|                 token, | ||||
|                 py_predicates | ||||
|             ) | ||||
|         action = get_action(states[i], token.c, extra_attrs, | ||||
|                             cached_py_predicates) | ||||
|         if action == REJECT: | ||||
|  | @ -475,8 +491,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match | |||
|                     align_new_states.push_back(align_states[q]) | ||||
|             states[q].pattern += 1 | ||||
|             if states[q].pattern.nr_py != 0: | ||||
|                 update_predicate_cache(cached_py_predicates, | ||||
|                     states[q].pattern, token, py_predicates) | ||||
|                 update_predicate_cache( | ||||
|                     cached_py_predicates, | ||||
|                     states[q].pattern, | ||||
|                     token, | ||||
|                     py_predicates | ||||
|                 ) | ||||
|             action = get_action(states[q], token.c, extra_attrs, | ||||
|                                 cached_py_predicates) | ||||
|         # Update alignment before the transition of current state | ||||
|  | @ -492,8 +512,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match | |||
|             ent_id = get_ent_id(state.pattern) | ||||
|             if action == MATCH: | ||||
|                 matches.push_back( | ||||
|                     MatchC(pattern_id=ent_id, start=state.start, | ||||
|                             length=state.length+1)) | ||||
|                     MatchC( | ||||
|                         pattern_id=ent_id, | ||||
|                         start=state.start, | ||||
|                         length=state.length+1 | ||||
|                     ) | ||||
|                 ) | ||||
|                 # `align_matches` always corresponds to `matches` 1:1 | ||||
|                 if with_alignments != 0: | ||||
|                     align_matches.push_back(align_states[q]) | ||||
|  | @ -501,23 +525,35 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match | |||
|                 # push match without last token if length > 0 | ||||
|                 if state.length > 0: | ||||
|                     matches.push_back( | ||||
|                         MatchC(pattern_id=ent_id, start=state.start, | ||||
|                                 length=state.length)) | ||||
|                         MatchC( | ||||
|                             pattern_id=ent_id, | ||||
|                             start=state.start, | ||||
|                             length=state.length | ||||
|                         ) | ||||
|                     ) | ||||
|                     # MATCH_DOUBLE emits matches twice, | ||||
|                     # add one more to align_matches in order to keep 1:1 relationship | ||||
|                     if with_alignments != 0: | ||||
|                         align_matches.push_back(align_states[q]) | ||||
|                 # push match with last token | ||||
|                 matches.push_back( | ||||
|                     MatchC(pattern_id=ent_id, start=state.start, | ||||
|                             length=state.length+1)) | ||||
|                     MatchC( | ||||
|                         pattern_id=ent_id, | ||||
|                         start=state.start, | ||||
|                         length=state.length + 1 | ||||
|                     ) | ||||
|                 ) | ||||
|                 # `align_matches` always corresponds to `matches` 1:1 | ||||
|                 if with_alignments != 0: | ||||
|                     align_matches.push_back(align_states[q]) | ||||
|             elif action == MATCH_REJECT: | ||||
|                 matches.push_back( | ||||
|                     MatchC(pattern_id=ent_id, start=state.start, | ||||
|                             length=state.length)) | ||||
|                     MatchC( | ||||
|                         pattern_id=ent_id, | ||||
|                         start=state.start, | ||||
|                         length=state.length | ||||
|                     ) | ||||
|                 ) | ||||
|                 # `align_matches` always corresponds to `matches` 1:1 | ||||
|                 if with_alignments != 0: | ||||
|                     align_matches.push_back(align_states[q]) | ||||
|  | @ -540,8 +576,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match | |||
|             align_states.push_back(align_new_states[i]) | ||||
| 
 | ||||
| 
 | ||||
| cdef int update_predicate_cache(int8_t* cache, | ||||
|         const TokenPatternC* pattern, Token token, predicates) except -1: | ||||
| cdef int update_predicate_cache( | ||||
|     int8_t* cache, | ||||
|     const TokenPatternC* pattern, | ||||
|     Token token, | ||||
|     predicates | ||||
| ) except -1: | ||||
|     # If the state references any extra predicates, check whether they match. | ||||
|     # These are cached, so that we don't call these potentially expensive | ||||
|     # Python functions more than we need to. | ||||
|  | @ -587,10 +627,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, | |||
|             else: | ||||
|                 state.pattern += 1 | ||||
| 
 | ||||
| 
 | ||||
| cdef action_t get_action(PatternStateC state, | ||||
|         const TokenC* token, const attr_t* extra_attrs, | ||||
|         const int8_t* predicate_matches) nogil: | ||||
| cdef action_t get_action( | ||||
|     PatternStateC state, | ||||
|     const TokenC * token, | ||||
|     const attr_t * extra_attrs, | ||||
|     const int8_t * predicate_matches | ||||
| ) nogil: | ||||
|     """We need to consider: | ||||
|     a) Does the token match the specification? [Yes, No] | ||||
|     b) What's the quantifier? [1, 0+, ?] | ||||
|  | @ -656,53 +698,56 @@ cdef action_t get_action(PatternStateC state, | |||
|         is_match = not is_match | ||||
|         quantifier = ONE | ||||
|     if quantifier == ONE: | ||||
|       if is_match and is_final: | ||||
|           # Yes, final: 1000 | ||||
|           return MATCH | ||||
|       elif is_match and not is_final: | ||||
|           # Yes, non-final: 0100 | ||||
|           return ADVANCE | ||||
|       elif not is_match and is_final: | ||||
|           # No, final: 0000 | ||||
|           return REJECT | ||||
|       else: | ||||
|           return REJECT | ||||
|         if is_match and is_final: | ||||
|             # Yes, final: 1000 | ||||
|             return MATCH | ||||
|         elif is_match and not is_final: | ||||
|             # Yes, non-final: 0100 | ||||
|             return ADVANCE | ||||
|         elif not is_match and is_final: | ||||
|             # No, final: 0000 | ||||
|             return REJECT | ||||
|         else: | ||||
|             return REJECT | ||||
|     elif quantifier == ZERO_PLUS: | ||||
|       if is_match and is_final: | ||||
|           # Yes, final: 1001 | ||||
|           return MATCH_EXTEND | ||||
|       elif is_match and not is_final: | ||||
|           # Yes, non-final: 0011 | ||||
|           return RETRY_EXTEND | ||||
|       elif not is_match and is_final: | ||||
|           # No, final 2000 (note: Don't include last token!) | ||||
|           return MATCH_REJECT | ||||
|       else: | ||||
|           # No, non-final 0010 | ||||
|           return RETRY | ||||
|         if is_match and is_final: | ||||
|             # Yes, final: 1001 | ||||
|             return MATCH_EXTEND | ||||
|         elif is_match and not is_final: | ||||
|             # Yes, non-final: 0011 | ||||
|             return RETRY_EXTEND | ||||
|         elif not is_match and is_final: | ||||
|             # No, final 2000 (note: Don't include last token!) | ||||
|             return MATCH_REJECT | ||||
|         else: | ||||
|             # No, non-final 0010 | ||||
|             return RETRY | ||||
|     elif quantifier == ZERO_ONE: | ||||
|       if is_match and is_final: | ||||
|           # Yes, final: 3000 | ||||
|           # To cater for a pattern ending in "?", we need to add | ||||
|           # a match both with and without the last token | ||||
|           return MATCH_DOUBLE | ||||
|       elif is_match and not is_final: | ||||
|           # Yes, non-final: 0110 | ||||
|           # We need both branches here, consider a pair like: | ||||
|           # pattern: .?b string: b | ||||
|           # If we 'ADVANCE' on the .?, we miss the match. | ||||
|           return RETRY_ADVANCE | ||||
|       elif not is_match and is_final: | ||||
|           # No, final 2000 (note: Don't include last token!) | ||||
|           return MATCH_REJECT | ||||
|       else: | ||||
|           # No, non-final 0010 | ||||
|           return RETRY | ||||
|         if is_match and is_final: | ||||
|             # Yes, final: 3000 | ||||
|             # To cater for a pattern ending in "?", we need to add | ||||
|             # a match both with and without the last token | ||||
|             return MATCH_DOUBLE | ||||
|         elif is_match and not is_final: | ||||
|             # Yes, non-final: 0110 | ||||
|             # We need both branches here, consider a pair like: | ||||
|             # pattern: .?b string: b | ||||
|             # If we 'ADVANCE' on the .?, we miss the match. | ||||
|             return RETRY_ADVANCE | ||||
|         elif not is_match and is_final: | ||||
|             # No, final 2000 (note: Don't include last token!) | ||||
|             return MATCH_REJECT | ||||
|         else: | ||||
|             # No, non-final 0010 | ||||
|             return RETRY | ||||
| 
 | ||||
| 
 | ||||
| cdef int8_t get_is_match(PatternStateC state, | ||||
|         const TokenC* token, const attr_t* extra_attrs, | ||||
|         const int8_t* predicate_matches) nogil: | ||||
| cdef int8_t get_is_match( | ||||
|     PatternStateC state, | ||||
|     const TokenC* token, | ||||
|     const attr_t* extra_attrs, | ||||
|     const int8_t* predicate_matches | ||||
| ) nogil: | ||||
|     for i in range(state.pattern.nr_py): | ||||
|         if predicate_matches[state.pattern.py_predicates[i]] == -1: | ||||
|             return 0 | ||||
|  | @ -867,7 +912,7 @@ class _FuzzyPredicate: | |||
|         self.is_extension = is_extension | ||||
|         if self.predicate not in self.operators: | ||||
|             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) | ||||
|         fuzz = self.predicate[len("FUZZY"):] # number after prefix | ||||
|         fuzz = self.predicate[len("FUZZY"):]  # number after prefix | ||||
|         self.fuzzy = int(fuzz) if fuzz else -1 | ||||
|         self.fuzzy_compare = fuzzy_compare | ||||
|         self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy) | ||||
|  | @ -1089,7 +1134,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types, | |||
|         elif cls == _FuzzyPredicate: | ||||
|             if isinstance(value, dict): | ||||
|                 # add predicates inside fuzzy operator | ||||
|                 fuzz = type_[len("FUZZY"):] # number after prefix | ||||
|                 fuzz = type_[len("FUZZY"):]  # number after prefix | ||||
|                 fuzzy_val = int(fuzz) if fuzz else -1 | ||||
|                 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, | ||||
|                                                          extra_predicates, seen_predicates, | ||||
|  | @ -1108,8 +1153,9 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types, | |||
|     return output | ||||
| 
 | ||||
| 
 | ||||
| def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, | ||||
|         seen_predicates): | ||||
| def _get_extension_extra_predicates( | ||||
|     spec, extra_predicates, predicate_types, seen_predicates | ||||
| ): | ||||
|     output = [] | ||||
|     for attr, value in spec.items(): | ||||
|         if isinstance(value, dict): | ||||
|  | @ -1138,7 +1184,7 @@ def _get_operators(spec): | |||
|         return (ONE,) | ||||
|     elif spec["OP"] in lookup: | ||||
|         return lookup[spec["OP"]] | ||||
|     #Min_max {n,m} | ||||
|     # Min_max {n,m} | ||||
|     elif spec["OP"].startswith("{") and spec["OP"].endswith("}"): | ||||
|         # {n}  --> {n,n}  exactly n                 ONE,(n) | ||||
|         # {n,m}--> {n,m}  min of n, max of m        ONE,(n),ZERO_ONE,(m) | ||||
|  | @ -1149,8 +1195,8 @@ def _get_operators(spec): | |||
|         min_max = min_max if "," in min_max else f"{min_max},{min_max}" | ||||
|         n, m = min_max.split(",") | ||||
| 
 | ||||
|         #1. Either n or m is a blank string and the other is numeric -->isdigit | ||||
|         #2. Both are numeric and n <= m | ||||
|         # 1. Either n or m is a blank string and the other is numeric --> isdecimal | ||||
|         # 2. Both are numeric and n <= m | ||||
|         if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)): | ||||
|             keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m " | ||||
|             raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys)) | ||||
|  |  | |||
|  | @ -2,16 +2,14 @@ | |||
| from collections import defaultdict | ||||
| from typing import List | ||||
| 
 | ||||
| from libc.stdint cimport uintptr_t | ||||
| from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set | ||||
| 
 | ||||
| import warnings | ||||
| 
 | ||||
| from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG | ||||
| from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG | ||||
| 
 | ||||
| from ..attrs import IDS | ||||
| 
 | ||||
| from ..structs cimport TokenC | ||||
| from ..tokens.span cimport Span | ||||
| from ..tokens.token cimport Token | ||||
| from ..typedefs cimport attr_t | ||||
|  | @ -160,7 +158,6 @@ cdef class PhraseMatcher: | |||
|         del self._callbacks[key] | ||||
|         del self._docs[key] | ||||
| 
 | ||||
| 
 | ||||
|     def _add_from_arrays(self, key, specs, *, on_match=None): | ||||
|         """Add a preprocessed list of specs, with an optional callback. | ||||
| 
 | ||||
|  | @ -196,7 +193,6 @@ cdef class PhraseMatcher: | |||
|                 result = internal_node | ||||
|             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL) | ||||
| 
 | ||||
| 
 | ||||
|     def add(self, key, docs, *, on_match=None): | ||||
|         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID | ||||
|         key, a list of one or more patterns, and (optionally) an on_match callback. | ||||
|  |  | |||
|  | @ -1,3 +1,4 @@ | |||
| import warnings | ||||
| from typing import Callable, List, Optional, Sequence, Tuple, cast | ||||
| 
 | ||||
| from thinc.api import Model, Ops, registry | ||||
|  | @ -5,7 +6,8 @@ from thinc.initializers import glorot_uniform_init | |||
| from thinc.types import Floats1d, Floats2d, Ints1d, Ragged | ||||
| from thinc.util import partial | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| from ..attrs import ORTH | ||||
| from ..errors import Errors, Warnings | ||||
| from ..tokens import Doc | ||||
| from ..vectors import Mode | ||||
| from ..vocab import Vocab | ||||
|  | @ -24,6 +26,8 @@ def StaticVectors( | |||
|     linear projection to control the dimensionality. If a dropout rate is | ||||
|     specified, the dropout is applied per dimension over the whole batch. | ||||
|     """ | ||||
|     if key_attr != "ORTH": | ||||
|         warnings.warn(Warnings.W125, DeprecationWarning) | ||||
|     return Model( | ||||
|         "static_vectors", | ||||
|         forward, | ||||
|  | @ -40,9 +44,9 @@ def forward( | |||
|     token_count = sum(len(doc) for doc in docs) | ||||
|     if not token_count: | ||||
|         return _handle_empty(model.ops, model.get_dim("nO")) | ||||
|     key_attr: int = model.attrs["key_attr"] | ||||
|     keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs]) | ||||
|     vocab: Vocab = docs[0].vocab | ||||
|     key_attr: int = getattr(vocab.vectors, "attr", ORTH) | ||||
|     keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs]) | ||||
|     W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) | ||||
|     if vocab.vectors.mode == Mode.default: | ||||
|         V = model.ops.asarray(vocab.vectors.data) | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| # cython: infer_types=True, cdivision=True, boundscheck=False | ||||
| from typing import Any, List, Optional, Tuple, TypeVar, cast | ||||
| from typing import Any, List, Optional, Tuple, cast | ||||
| 
 | ||||
| from libc.stdlib cimport calloc, free, realloc | ||||
| from libc.string cimport memcpy, memset | ||||
|  | @ -23,7 +23,7 @@ from thinc.api import ( | |||
| 
 | ||||
| from thinc.backends.cblas cimport CBlas, saxpy, sgemm | ||||
| 
 | ||||
| from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d | ||||
| from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| from ..pipeline._parser_internals import _beam_utils | ||||
|  | @ -136,7 +136,7 @@ def init( | |||
|     Y: Optional[Tuple[List[State], List[Floats2d]]] = None, | ||||
| ): | ||||
|     if X is not None: | ||||
|         docs, moves = X | ||||
|         docs, _ = X | ||||
|         model.get_ref("tok2vec").initialize(X=docs) | ||||
|     else: | ||||
|         model.get_ref("tok2vec").initialize() | ||||
|  | @ -145,7 +145,6 @@ def init( | |||
|         current_nO = model.maybe_get_dim("nO") | ||||
|         if current_nO is None or current_nO != inferred_nO: | ||||
|             model.attrs["resize_output"](model, inferred_nO) | ||||
|     nO = model.get_dim("nO") | ||||
|     nP = model.get_dim("nP") | ||||
|     nH = model.get_dim("nH") | ||||
|     nI = model.get_dim("nI") | ||||
|  | @ -192,9 +191,10 @@ class TransitionModelInputs: | |||
|         self, | ||||
|         docs: List[Doc], | ||||
|         moves: TransitionSystem, | ||||
|         actions: Optional[List[Ints1d]]=None, | ||||
|         max_moves: int=0, | ||||
|         states: Optional[List[State]]=None): | ||||
|         actions: Optional[List[Ints1d]] = None, | ||||
|         max_moves: int = 0, | ||||
|         states: Optional[List[State]] = None, | ||||
|     ): | ||||
|         """ | ||||
|         actions (Optional[List[Ints1d]]): actions to apply for each Doc. | ||||
|         docs (List[Doc]): Docs to predict transition sequences for. | ||||
|  | @ -234,12 +234,12 @@ def forward(model, inputs: TransitionModelInputs, is_train: bool): | |||
|         return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions) | ||||
|     else: | ||||
|         return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec, | ||||
|             feats, backprop_feats, seen_mask, is_train, actions=actions, | ||||
|             max_moves=inputs.max_moves) | ||||
|                                  feats, backprop_feats, seen_mask, is_train, actions=actions, | ||||
|                                  max_moves=inputs.max_moves) | ||||
| 
 | ||||
| 
 | ||||
| def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats, | ||||
|                 np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None): | ||||
|                         np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None): | ||||
|     cdef vector[StateC*] c_states | ||||
|     cdef StateClass state | ||||
|     for state in states: | ||||
|  | @ -257,9 +257,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State | |||
| 
 | ||||
|     return (states, scores), backprop | ||||
| 
 | ||||
| 
 | ||||
| cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, | ||||
|                        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None): | ||||
|     cdef int i, j | ||||
|     cdef int i | ||||
|     cdef vector[StateC *] unfinished | ||||
|     cdef ActivationsC activations = _alloc_activations(sizes) | ||||
|     cdef np.ndarray step_scores | ||||
|  | @ -276,7 +277,7 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, | |||
|             if actions is None: | ||||
|                 # Validate actions, argmax, take action. | ||||
|                 c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes, | ||||
|                     sizes.states) | ||||
|                                    sizes.states) | ||||
|             else: | ||||
|                 c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states) | ||||
|             for i in range(sizes.states): | ||||
|  | @ -302,8 +303,9 @@ def _forward_fallback( | |||
|     backprop_feats, | ||||
|     seen_mask, | ||||
|     is_train: bool, | ||||
|     actions: Optional[List[Ints1d]]=None, | ||||
|     max_moves: int=0): | ||||
|     actions: Optional[List[Ints1d]] = None, | ||||
|     max_moves: int = 0, | ||||
| ): | ||||
|     nF = model.get_dim("nF") | ||||
|     output = model.get_ref("output") | ||||
|     hidden_b = model.get_param("hidden_b") | ||||
|  | @ -371,7 +373,7 @@ def _forward_fallback( | |||
|             for clas in set(model.attrs["unseen_classes"]): | ||||
|                 if (d_scores[:, clas] < 0).any(): | ||||
|                     model.attrs["unseen_classes"].remove(clas) | ||||
|         d_scores *= seen_mask == False | ||||
|         d_scores *= seen_mask == False  # no-cython-lint | ||||
|         # Calculate the gradients for the parameters of the output layer. | ||||
|         # The weight gemm is (nS, nO) @ (nS, nH).T | ||||
|         output.inc_grad("b", d_scores.sum(axis=0)) | ||||
|  | @ -571,13 +573,13 @@ cdef void _resize_activations(ActivationsC* A, SizesC n) nogil: | |||
|         A._max_size = n.states | ||||
|     else: | ||||
|         A.token_ids = <int*>realloc(A.token_ids, | ||||
|             n.states * n.feats * sizeof(A.token_ids[0])) | ||||
|                                     n.states * n.feats * sizeof(A.token_ids[0])) | ||||
|         A.unmaxed = <float*>realloc(A.unmaxed, | ||||
|             n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])) | ||||
|                                     n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])) | ||||
|         A.hiddens = <float*>realloc(A.hiddens, | ||||
|             n.states * n.hiddens * sizeof(A.hiddens[0])) | ||||
|                                     n.states * n.hiddens * sizeof(A.hiddens[0])) | ||||
|         A.is_valid = <int*>realloc(A.is_valid, | ||||
|             n.states * n.classes * sizeof(A.is_valid[0])) | ||||
|                                    n.states * n.classes * sizeof(A.is_valid[0])) | ||||
|         A._max_size = n.states | ||||
|     A._curr_size = n.states | ||||
| 
 | ||||
|  | @ -599,9 +601,9 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** | |||
|     else: | ||||
|         # Compute hidden-to-output | ||||
|         sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, | ||||
|                       1.0, <const float *>A.hiddens, n.hiddens, | ||||
|                       <const float *>W.hidden_weights, n.hiddens, | ||||
|                       0.0, scores, n.classes) | ||||
|                      1.0, <const float *>A.hiddens, n.hiddens, | ||||
|                      <const float *>W.hidden_weights, n.hiddens, | ||||
|                      0.0, scores, n.classes) | ||||
|         # Add bias | ||||
|         for i in range(n.states): | ||||
|             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1) | ||||
|  | @ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** | |||
|                 scores[i*n.classes+j] = min_ | ||||
| 
 | ||||
| 
 | ||||
| cdef void _sum_state_features(CBlas cblas, float* output, | ||||
|         const float* cached, const int* token_ids, SizesC n) nogil: | ||||
|     cdef int idx, b, f, i | ||||
| cdef void _sum_state_features(CBlas cblas, float* output, const float* cached, | ||||
|                               const int* token_ids, SizesC n) nogil: | ||||
|     cdef int idx, b, f | ||||
|     cdef const float* feature | ||||
|     cdef int B = n.states | ||||
|     cdef int O = n.hiddens * n.pieces | ||||
|     cdef int O = n.hiddens * n.pieces  # no-cython-lint | ||||
|     cdef int F = n.feats | ||||
|     cdef int T = n.tokens | ||||
|     padding = cached + (T * F * O) | ||||
|  | @ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output, | |||
|                 feature = &cached[idx] | ||||
|             saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1) | ||||
|         token_ids += F | ||||
| 
 | ||||
|  |  | |||
|  | @ -80,15 +80,13 @@ cdef class Morphology: | |||
|         out.sort(key=lambda x: x[0]) | ||||
|         return dict(out) | ||||
| 
 | ||||
| 
 | ||||
|     def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str: | ||||
|         norm_feats_string = self.FEATURE_SEP.join([ | ||||
|                 self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values]) | ||||
|             self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values]) | ||||
|             for field, values in feats.items() | ||||
|         ]) | ||||
|             ]) | ||||
|         return norm_feats_string or self.EMPTY_MORPH | ||||
| 
 | ||||
| 
 | ||||
|     cdef hash_t _add(self, features): | ||||
|         """Insert a morphological analysis in the morphology table, if not | ||||
|         already present. The morphological analysis may be provided in the UD | ||||
|  | @ -140,7 +138,7 @@ cdef class Morphology: | |||
|                     field_feature_pairs.append((field_key, value_key)) | ||||
|             else: | ||||
|                 # We could box scalar values into a list and use a common | ||||
|                 # code path to generate features but that incurs a small  | ||||
|                 # code path to generate features but that incurs a small | ||||
|                 # but measurable allocation/iteration overhead (as this | ||||
|                 # branch is taken often enough). | ||||
|                 value_key = self.strings.add(field + self.FIELD_SEP + values) | ||||
|  | @ -246,6 +244,7 @@ cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, | |||
|             n_results += 1 | ||||
|     return n_results | ||||
| 
 | ||||
| 
 | ||||
| def unpickle_morphology(strings, tags): | ||||
|     cdef Morphology morphology = Morphology(strings) | ||||
|     for tag in tags: | ||||
|  |  | |||
|  | @ -8,7 +8,7 @@ cpdef enum univ_pos_t: | |||
|     ADV = symbols.ADV | ||||
|     AUX = symbols.AUX | ||||
|     CONJ = symbols.CONJ | ||||
|     CCONJ  = symbols.CCONJ  # U20 | ||||
|     CCONJ = symbols.CCONJ  # U20 | ||||
|     DET = symbols.DET | ||||
|     INTJ = symbols.INTJ | ||||
|     NOUN = symbols.NOUN | ||||
|  |  | |||
|  | @ -46,11 +46,18 @@ cdef struct EditTreeC: | |||
|     bint is_match_node | ||||
|     NodeC inner | ||||
| 
 | ||||
| cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len, | ||||
|         uint32_t prefix_tree, uint32_t suffix_tree): | ||||
|     cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len, | ||||
|             suffix_len=suffix_len, prefix_tree=prefix_tree, | ||||
|             suffix_tree=suffix_tree) | ||||
| cdef inline EditTreeC edittree_new_match( | ||||
|     len_t prefix_len, | ||||
|     len_t suffix_len, | ||||
|     uint32_t prefix_tree, | ||||
|     uint32_t suffix_tree | ||||
| ): | ||||
|     cdef MatchNodeC match_node = MatchNodeC( | ||||
|         prefix_len=prefix_len, | ||||
|         suffix_len=suffix_len, | ||||
|         prefix_tree=prefix_tree, | ||||
|         suffix_tree=suffix_tree | ||||
|     ) | ||||
|     cdef NodeC inner = NodeC(match_node=match_node) | ||||
|     return EditTreeC(is_match_node=True, inner=inner) | ||||
| 
 | ||||
|  |  | |||
|  | @ -5,8 +5,6 @@ from libc.string cimport memset | |||
| from libcpp.pair cimport pair | ||||
| from libcpp.vector cimport vector | ||||
| 
 | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ...typedefs cimport hash_t | ||||
| 
 | ||||
| from ... import util | ||||
|  | @ -25,17 +23,16 @@ cdef LCS find_lcs(str source, str target): | |||
|     target (str): The second string. | ||||
|     RETURNS (LCS): The spans of the longest common subsequences. | ||||
|     """ | ||||
|     cdef Py_ssize_t source_len = len(source) | ||||
|     cdef Py_ssize_t target_len = len(target) | ||||
|     cdef size_t longest_align = 0; | ||||
|     cdef size_t longest_align = 0 | ||||
|     cdef int source_idx, target_idx | ||||
|     cdef LCS lcs | ||||
|     cdef Py_UCS4 source_cp, target_cp | ||||
| 
 | ||||
|     memset(&lcs, 0, sizeof(lcs)) | ||||
| 
 | ||||
|     cdef vector[size_t] prev_aligns = vector[size_t](target_len); | ||||
|     cdef vector[size_t] cur_aligns = vector[size_t](target_len); | ||||
|     cdef vector[size_t] prev_aligns = vector[size_t](target_len) | ||||
|     cdef vector[size_t] cur_aligns = vector[size_t](target_len) | ||||
| 
 | ||||
|     for (source_idx, source_cp) in enumerate(source): | ||||
|         for (target_idx, target_cp) in enumerate(target): | ||||
|  | @ -89,7 +86,7 @@ cdef class EditTrees: | |||
|         cdef LCS lcs = find_lcs(form, lemma) | ||||
| 
 | ||||
|         cdef EditTreeC tree | ||||
|         cdef uint32_t tree_id, prefix_tree, suffix_tree | ||||
|         cdef uint32_t prefix_tree, suffix_tree | ||||
|         if lcs_is_empty(lcs): | ||||
|             tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma)) | ||||
|         else: | ||||
|  | @ -108,7 +105,7 @@ cdef class EditTrees: | |||
|         return self._tree_id(tree) | ||||
| 
 | ||||
|     cdef uint32_t _tree_id(self, EditTreeC tree): | ||||
|          # If this tree has been constructed before, return its identifier. | ||||
|         # If this tree has been constructed before, return its identifier. | ||||
|         cdef hash_t hash = edittree_hash(tree) | ||||
|         cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash) | ||||
|         if iter != self.map.end(): | ||||
|  | @ -289,6 +286,7 @@ def _tree2dict(tree): | |||
|         tree = tree["inner"]["subst_node"] | ||||
|     return(dict(tree)) | ||||
| 
 | ||||
| 
 | ||||
| def _dict2tree(tree): | ||||
|     errors = validate_edit_tree(tree) | ||||
|     if errors: | ||||
|  |  | |||
|  | @ -1,12 +1,8 @@ | |||
| # cython: infer_types=True | ||||
| # cython: profile=True | ||||
| cimport numpy as np | ||||
| 
 | ||||
| import numpy | ||||
| 
 | ||||
| from cpython.ref cimport Py_XDECREF, PyObject | ||||
| 
 | ||||
| from ...typedefs cimport class_t, hash_t | ||||
| from ...typedefs cimport class_t | ||||
| from .transition_system cimport Transition, TransitionSystem | ||||
| 
 | ||||
| from ...errors import Errors | ||||
|  | @ -146,7 +142,6 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_de | |||
|     cdef MaxViolation violn | ||||
|     pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density) | ||||
|     gbeam = BeamBatch(moves, states, golds, width=width, density=0.0) | ||||
|     cdef StateClass state | ||||
|     beam_maps = [] | ||||
|     backprops = [] | ||||
|     violns = [MaxViolation() for _ in range(len(states))] | ||||
|  |  | |||
|  | @ -280,7 +280,6 @@ cdef cppclass StateC: | |||
| 
 | ||||
|         return n | ||||
| 
 | ||||
| 
 | ||||
|     int n_L(int head) nogil const: | ||||
|         return n_arcs(this._left_arcs, head) | ||||
| 
 | ||||
|  |  | |||
|  | @ -9,7 +9,7 @@ from ...strings cimport hash_string | |||
| from ...structs cimport TokenC | ||||
| from ...tokens.doc cimport Doc, set_children_from_heads | ||||
| from ...tokens.token cimport MISSING_DEP | ||||
| from ...typedefs cimport attr_t, hash_t | ||||
| from ...typedefs cimport attr_t | ||||
| 
 | ||||
| from ...training import split_bilu_label | ||||
| 
 | ||||
|  | @ -68,8 +68,9 @@ cdef struct GoldParseStateC: | |||
|     weight_t pop_cost | ||||
| 
 | ||||
| 
 | ||||
| cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state, | ||||
|         heads, labels, sent_starts) except *: | ||||
| cdef GoldParseStateC create_gold_state( | ||||
|     Pool mem, const StateC* state, heads, labels, sent_starts | ||||
| ) except *: | ||||
|     cdef GoldParseStateC gs | ||||
|     gs.length = len(heads) | ||||
|     gs.stride = 1 | ||||
|  | @ -82,7 +83,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state, | |||
|     gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0])) | ||||
| 
 | ||||
|     for i, is_sent_start in enumerate(sent_starts): | ||||
|         if is_sent_start == True: | ||||
|         if is_sent_start is True: | ||||
|             gs.state_bits[i] = set_state_flag( | ||||
|                 gs.state_bits[i], | ||||
|                 IS_SENT_START, | ||||
|  | @ -210,6 +211,7 @@ cdef class ArcEagerGold: | |||
|     def update(self, StateClass stcls): | ||||
|         update_gold_state(&self.c, stcls.c) | ||||
| 
 | ||||
| 
 | ||||
| def _get_aligned_sent_starts(example): | ||||
|     """Get list of SENT_START attributes aligned to the predicted tokenization. | ||||
|     If the reference has no sentence starts, return a list of None values. | ||||
|  | @ -524,7 +526,6 @@ cdef class Break: | |||
|     """ | ||||
|     @staticmethod | ||||
|     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||
|         cdef int i | ||||
|         if st.buffer_length() < 2: | ||||
|             return False | ||||
|         elif st.B(1) != st.B(0) + 1: | ||||
|  | @ -556,8 +557,8 @@ cdef class Break: | |||
|                 cost -= 1 | ||||
|             if gold.heads[si] == b0: | ||||
|                 cost -= 1 | ||||
|         if not is_sent_start(gold, state.B(1)) \ | ||||
|         and not is_sent_start_unknown(gold, state.B(1)): | ||||
|         if not is_sent_start(gold, state.B(1)) and\ | ||||
|                 not is_sent_start_unknown(gold, state.B(1)): | ||||
|             cost += 1 | ||||
|         return cost | ||||
| 
 | ||||
|  | @ -805,7 +806,6 @@ cdef class ArcEager(TransitionSystem): | |||
|             raise TypeError(Errors.E909.format(name="ArcEagerGold")) | ||||
|         cdef ArcEagerGold gold_ = gold | ||||
|         gold_state = gold_.c | ||||
|         n_gold = 0 | ||||
|         if self.c[i].is_valid(stcls.c, self.c[i].label): | ||||
|             cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label) | ||||
|         else: | ||||
|  | @ -878,7 +878,7 @@ cdef class ArcEager(TransitionSystem): | |||
|             print("Gold") | ||||
|             for token in example.y: | ||||
|                 print(token.i, token.text, token.dep_, token.head.text) | ||||
|             aligned_heads, aligned_labels = example.get_aligned_parse() | ||||
|             aligned_heads, _aligned_labels = example.get_aligned_parse() | ||||
|             print("Aligned heads") | ||||
|             for i, head in enumerate(aligned_heads): | ||||
|                 print(example.x[i], example.x[head] if head is not None else "__") | ||||
|  |  | |||
|  | @ -1,8 +1,4 @@ | |||
| import os | ||||
| import random | ||||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
| from libc.stdint cimport int32_t | ||||
| from libcpp.memory cimport shared_ptr | ||||
| from libcpp.vector cimport vector | ||||
| 
 | ||||
|  | @ -14,7 +10,7 @@ from ...tokens.span import Span | |||
| 
 | ||||
| from ...attrs cimport IS_SPACE | ||||
| from ...lexeme cimport Lexeme | ||||
| from ...structs cimport SpanC, TokenC | ||||
| from ...structs cimport SpanC | ||||
| from ...tokens.span cimport Span | ||||
| from ...typedefs cimport attr_t, weight_t | ||||
| 
 | ||||
|  | @ -138,11 +134,10 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             OUT: Counter() | ||||
|         } | ||||
|         actions[OUT][''] = 1  # Represents a token predicted to be outside of any entity | ||||
|         actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity | ||||
|         actions[UNIT][''] = 1  # Represents a token prohibited to be in an entity | ||||
|         for entity_type in kwargs.get('entity_types', []): | ||||
|             for action in (BEGIN, IN, LAST, UNIT): | ||||
|                 actions[action][entity_type] = 1 | ||||
|         moves = ('M', 'B', 'I', 'L', 'U') | ||||
|         for example in kwargs.get('examples', []): | ||||
|             for token in example.y: | ||||
|                 ent_type = token.ent_type_ | ||||
|  | @ -324,7 +319,6 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             raise TypeError(Errors.E909.format(name="BiluoGold")) | ||||
|         cdef BiluoGold gold_ = gold | ||||
|         gold_state = gold_.c | ||||
|         n_gold = 0 | ||||
|         if self.c[i].is_valid(stcls.c, self.c[i].label): | ||||
|             cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label) | ||||
|         else: | ||||
|  | @ -487,10 +481,8 @@ cdef class In: | |||
|     @staticmethod | ||||
|     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: | ||||
|         gold = <GoldNERStateC*>_gold | ||||
|         move = IN | ||||
|         cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT | ||||
|         cdef int g_act = gold.ner[s.B(0)].move | ||||
|         cdef attr_t g_tag = gold.ner[s.B(0)].label | ||||
|         cdef bint is_sunk = _entity_is_sunk(s, gold.ner) | ||||
| 
 | ||||
|         if g_act == MISSING: | ||||
|  | @ -550,12 +542,10 @@ cdef class Last: | |||
|     @staticmethod | ||||
|     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: | ||||
|         gold = <GoldNERStateC*>_gold | ||||
|         move = LAST | ||||
|         b0 = s.B(0) | ||||
|         ent_start = s.E(0) | ||||
| 
 | ||||
|         cdef int g_act = gold.ner[b0].move | ||||
|         cdef attr_t g_tag = gold.ner[b0].label | ||||
| 
 | ||||
|         cdef int cost = 0 | ||||
| 
 | ||||
|  | @ -655,7 +645,6 @@ cdef class Unit: | |||
|         return cost | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| cdef class Out: | ||||
|     @staticmethod | ||||
|     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||
|  | @ -678,7 +667,6 @@ cdef class Out: | |||
|     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: | ||||
|         gold = <GoldNERStateC*>_gold | ||||
|         cdef int g_act = gold.ner[s.B(0)].move | ||||
|         cdef attr_t g_tag = gold.ner[s.B(0)].label | ||||
|         cdef weight_t cost = 0 | ||||
|         if g_act == MISSING: | ||||
|             pass | ||||
|  |  | |||
|  | @ -125,14 +125,17 @@ def decompose(label): | |||
| def is_decorated(label): | ||||
|     return DELIMITER in label | ||||
| 
 | ||||
| 
 | ||||
| def count_decorated_labels(gold_data): | ||||
|     freqs = {} | ||||
|     for example in gold_data: | ||||
|         proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"), | ||||
|                                              example.get_aligned("DEP")) | ||||
|         # set the label to ROOT for each root dependent | ||||
|         deco_deps = ['ROOT' if head == i else deco_deps[i] | ||||
|                        for i, head in enumerate(proj_heads)] | ||||
|         deco_deps = [ | ||||
|             'ROOT' if head == i else deco_deps[i] | ||||
|             for i, head in enumerate(proj_heads) | ||||
|         ] | ||||
|         # count label frequencies | ||||
|         for label in deco_deps: | ||||
|             if is_decorated(label): | ||||
|  | @ -160,9 +163,9 @@ def projectivize(heads, labels): | |||
| 
 | ||||
| 
 | ||||
| cdef vector[int] _heads_to_c(heads): | ||||
|     cdef vector[int] c_heads; | ||||
|     cdef vector[int] c_heads | ||||
|     for head in heads: | ||||
|         if head == None: | ||||
|         if head is None: | ||||
|             c_heads.push_back(-1) | ||||
|         else: | ||||
|             assert head < len(heads) | ||||
|  | @ -199,6 +202,7 @@ def _decorate(heads, proj_heads, labels): | |||
|             deco_labels.append(labels[tokenid]) | ||||
|     return deco_labels | ||||
| 
 | ||||
| 
 | ||||
| def get_smallest_nonproj_arc_slow(heads): | ||||
|     """Python-callable wrapper: convert `heads` with _heads_to_c and return | ||||
|     whatever _get_smallest_nonproj_arc yields for the converted vector. | ||||
|     """ | ||||
|     # _heads_to_c is declared to return a C++ vector[int]. | ||||
|     cdef vector[int] c_heads = _heads_to_c(heads) | ||||
|     return _get_smallest_nonproj_arc(c_heads) | ||||
|  |  | |||
|  | @ -57,7 +57,6 @@ cdef class Beam: | |||
|     cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func, | ||||
|                      void* extra_args) except -1 | ||||
|     cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1 | ||||
|   | ||||
| 
 | ||||
|     cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil: | ||||
|         self.scores[i][j] = score | ||||
|  |  | |||
|  | @ -1,11 +1,8 @@ | |||
| # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True | ||||
| cimport cython | ||||
| from libc.math cimport exp, log | ||||
| from libc.string cimport memcpy, memset | ||||
| 
 | ||||
| import math | ||||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
| from libc.math cimport exp | ||||
| from libc.string cimport memcpy, memset | ||||
| from preshed.maps cimport PreshMap | ||||
| 
 | ||||
| 
 | ||||
|  | @ -70,7 +67,7 @@ cdef class Beam: | |||
|             self.costs[i][j] = costs[j] | ||||
| 
 | ||||
|     cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1: | ||||
|         cdef int i, j | ||||
|         cdef int i | ||||
|         for i in range(self.width): | ||||
|             memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class) | ||||
|             memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class) | ||||
|  | @ -176,7 +173,6 @@ cdef class Beam: | |||
|         beam-width, and n is the number of classes. | ||||
|         """ | ||||
|         cdef Entry entry | ||||
|         cdef weight_t score | ||||
|         cdef _State* s | ||||
|         cdef int i, j, move_id | ||||
|         assert self.size >= 1 | ||||
|  | @ -269,7 +265,7 @@ cdef class MaxViolation: | |||
|                 # This can happen from non-monotonic actions | ||||
|                 # If we find a better gold analysis this way, be sure to keep it. | ||||
|                 elif pred._states[i].loss <= 0 \ | ||||
|                 and tuple(pred.histories[i]) not in seen_golds: | ||||
|                         and tuple(pred.histories[i]) not in seen_golds: | ||||
|                     g_scores.append(pred._states[i].score) | ||||
|                     g_hist.append(list(pred.histories[i])) | ||||
|             for i in range(gold.size): | ||||
|  |  | |||
|  | @ -1,6 +1,4 @@ | |||
| # cython: infer_types=True | ||||
| import numpy | ||||
| 
 | ||||
| from libcpp.vector cimport vector | ||||
| 
 | ||||
| from ...tokens.doc cimport Doc | ||||
|  | @ -42,11 +40,11 @@ cdef class StateClass: | |||
|         cdef vector[ArcC] arcs | ||||
|         self.c.get_arcs(&arcs) | ||||
|         return list(arcs) | ||||
|         #py_arcs = [] | ||||
|         #for arc in arcs: | ||||
|         #    if arc.head != -1 and arc.child != -1: | ||||
|         #        py_arcs.append((arc.head, arc.child, arc.label)) | ||||
|         #return arcs | ||||
|         # py_arcs = [] | ||||
|         # for arc in arcs: | ||||
|         #     if arc.head != -1 and arc.child != -1: | ||||
|         #         py_arcs.append((arc.head, arc.child, arc.label)) | ||||
|         # return arcs | ||||
| 
 | ||||
|     def add_arc(self, int head, int child, int label): | ||||
|         self.c.add_arc(head, child, label) | ||||
|  | @ -56,10 +54,10 @@ cdef class StateClass: | |||
| 
 | ||||
|     def H(self, int child): | ||||
|         """Return the result of self.c.H(child) on the wrapped C state.""" | ||||
|         return self.c.H(child) | ||||
|      | ||||
| 
 | ||||
|     def L(self, int head, int idx): | ||||
|         """Return the result of self.c.L(head, idx) on the wrapped C state.""" | ||||
|         return self.c.L(head, idx) | ||||
|      | ||||
| 
 | ||||
|     def R(self, int head, int idx): | ||||
|         """Return the result of self.c.R(head, idx) on the wrapped C state.""" | ||||
|         return self.c.R(head, idx) | ||||
| 
 | ||||
|  | @ -102,7 +100,7 @@ cdef class StateClass: | |||
| 
 | ||||
|     def H(self, int i): | ||||
|         """Return the result of self.c.H(i) on the wrapped C state.""" | ||||
|         return self.c.H(i) | ||||
|      | ||||
| 
 | ||||
|     def E(self, int i): | ||||
|         """Return the result of self.c.E(i) on the wrapped C state.""" | ||||
|         return self.c.E(i) | ||||
| 
 | ||||
|  | @ -120,7 +118,7 @@ cdef class StateClass: | |||
| 
 | ||||
|     def H_(self, int i): | ||||
|         """Return the item of self.doc at the index given by self.c.H(i).""" | ||||
|         return self.doc[self.c.H(i)] | ||||
|      | ||||
| 
 | ||||
|     def E_(self, int i): | ||||
|         """Return the item of self.doc at the index given by self.c.E(i).""" | ||||
|         return self.doc[self.c.E(i)] | ||||
| 
 | ||||
|  | @ -129,7 +127,7 @@ cdef class StateClass: | |||
| 
 | ||||
|     def R_(self, int i, int idx): | ||||
|         """Return the item of self.doc at the index given by self.c.R(i, idx).""" | ||||
|         return self.doc[self.c.R(i, idx)] | ||||
|   | ||||
| 
 | ||||
|     def empty(self): | ||||
|         """Return the result of self.c.empty() on the wrapped C state.""" | ||||
|         return self.c.empty() | ||||
| 
 | ||||
|  | @ -138,7 +136,7 @@ cdef class StateClass: | |||
| 
 | ||||
|     def at_break(self): | ||||
|         """Always return False. | ||||
|  | ||||
|         The delegation to self.c.at_break() is commented out below, so the | ||||
|         wrapped C state is never consulted here. | ||||
|         """ | ||||
|         return False | ||||
|         #return self.c.at_break() | ||||
|         # return self.c.at_break() | ||||
| 
 | ||||
|     def has_head(self, int i): | ||||
|         """Return the result of self.c.has_head(i) on the wrapped C state.""" | ||||
|         return self.c.has_head(i) | ||||
|  |  | |||
|  | @ -20,11 +20,15 @@ cdef struct Transition: | |||
|     int (*do)(StateC* state, attr_t label) nogil | ||||
| 
 | ||||
| 
 | ||||
| ctypedef weight_t (*get_cost_func_t)(const StateC* state, const void* gold, | ||||
|         attr_tlabel) nogil | ||||
| ctypedef weight_t (*move_cost_func_t)(const StateC* state, const void* gold) nogil | ||||
| ctypedef weight_t (*label_cost_func_t)(const StateC* state, const void* | ||||
|         gold, attr_t label) nogil | ||||
| # Pointer type for nogil cost callbacks that take the parser state, an opaque | ||||
| # gold pointer and a label, and return a weight_t. The third parameter is | ||||
| # `attr_t label` (type and name), the same parameter list that | ||||
| # label_cost_func_t declares. | ||||
| ctypedef weight_t (*get_cost_func_t)( | ||||
|     const StateC* state, const void* gold, attr_t label | ||||
| ) nogil | ||||
| ctypedef weight_t (*move_cost_func_t)( | ||||
|         const StateC* state, const void* gold | ||||
| ) nogil | ||||
| ctypedef weight_t (*label_cost_func_t)( | ||||
|     const StateC* state, const void* gold, attr_t label | ||||
| ) nogil | ||||
| 
 | ||||
| ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil | ||||
| 
 | ||||
|  | @ -56,7 +60,7 @@ cdef class TransitionSystem: | |||
| 
 | ||||
| 
 | ||||
| cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions, | ||||
|     int batch_size) nogil | ||||
|                           int batch_size) nogil | ||||
| 
 | ||||
| cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores, | ||||
|         int nr_class, int batch_size) nogil | ||||
|                              int nr_class, int batch_size) nogil | ||||
|  |  | |||
|  | @ -10,9 +10,7 @@ from collections import Counter | |||
| import srsly | ||||
| 
 | ||||
| from ...structs cimport TokenC | ||||
| from ...tokens.doc cimport Doc | ||||
| from ...typedefs cimport attr_t, weight_t | ||||
| from . cimport _beam_utils | ||||
| from ._parser_utils cimport arg_max_if_valid | ||||
| from .stateclass cimport StateClass | ||||
| 
 | ||||
|  | @ -270,7 +268,6 @@ cdef class TransitionSystem: | |||
|         return self | ||||
| 
 | ||||
|     def to_bytes(self, exclude=tuple()): | ||||
|         transitions = [] | ||||
|         serializers = { | ||||
|             'moves': lambda: srsly.json_dumps(self.labels), | ||||
|             'strings': lambda: self.strings.to_bytes(), | ||||
|  | @ -294,19 +291,19 @@ cdef class TransitionSystem: | |||
| 
 | ||||
| 
 | ||||
| # Apply one already-chosen action to each state in a batch, without the GIL. | ||||
| # For every i in range(batch_size): look up the Transition moves.c[actions[i]], | ||||
| # call its `do` callback on states[i] with the transition's label, then append | ||||
| # the transition's `clas` to that state's history. | ||||
| cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions, | ||||
|     int batch_size) nogil: | ||||
|         cdef int i | ||||
|         cdef Transition action | ||||
|         cdef StateC* state | ||||
|         for i in range(batch_size): | ||||
|             state = states[i] | ||||
|             action = moves.c[actions[i]] | ||||
|             action.do(state, action.label) | ||||
|             state.history.push_back(action.clas) | ||||
|                           int batch_size) nogil: | ||||
|     cdef int i | ||||
|     cdef Transition action | ||||
|     cdef StateC* state | ||||
|     for i in range(batch_size): | ||||
|         state = states[i] | ||||
|         action = moves.c[actions[i]] | ||||
|         action.do(state, action.label) | ||||
|         state.history.push_back(action.clas) | ||||
| 
 | ||||
| 
 | ||||
| cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores, | ||||
|     int nr_class, int batch_size) nogil: | ||||
|                              int nr_class, int batch_size) nogil: | ||||
|     is_valid = <int*>calloc(moves.n_moves, sizeof(int)) | ||||
|     cdef int i, guess | ||||
|     cdef Transition action | ||||
|  | @ -322,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa | |||
|             action.do(states[i], action.label) | ||||
|             states[i].history.push_back(guess) | ||||
|     free(is_valid) | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| from collections import defaultdict | ||||
| from typing import Callable, Iterable, Optional | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
| from thinc.api import Config, Model | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,11 +1,9 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| from itertools import islice | ||||
| from typing import Callable, Dict, Iterable, List, Optional, Union | ||||
| from typing import Callable, Dict, Iterable, Optional, Union | ||||
| 
 | ||||
| import srsly | ||||
| from thinc.api import Config, Model | ||||
| from thinc.legacy import LegacySequenceCategoricalCrossentropy | ||||
| from thinc.types import Floats2d, Ints1d | ||||
| 
 | ||||
| from ..morphology cimport Morphology | ||||
| from ..tokens.doc cimport Doc | ||||
|  | @ -16,10 +14,8 @@ from ..errors import Errors | |||
| from ..language import Language | ||||
| from ..parts_of_speech import IDS as POS_IDS | ||||
| from ..scorer import Scorer | ||||
| from ..symbols import POS | ||||
| from ..training import validate_examples, validate_get_examples | ||||
| from ..util import registry | ||||
| from .pipe import deserialize_config | ||||
| from .tagger import ActivationsT, Tagger | ||||
| 
 | ||||
| # See #9050 | ||||
|  | @ -86,8 +82,11 @@ def morphologizer_score(examples, **kwargs): | |||
|     results = {} | ||||
|     results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) | ||||
|     results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) | ||||
|     results.update(Scorer.score_token_attr_per_feat(examples, | ||||
|         "morph", getter=morph_key_getter, **kwargs)) | ||||
|     results.update( | ||||
|         Scorer.score_token_attr_per_feat( | ||||
|             examples, "morph", getter=morph_key_getter, **kwargs | ||||
|         ) | ||||
|     ) | ||||
|     return results | ||||
| 
 | ||||
| 
 | ||||
|  | @ -249,7 +248,6 @@ class Morphologizer(Tagger): | |||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|         cdef Doc doc | ||||
|         cdef Vocab vocab = self.vocab | ||||
|         cdef bint overwrite = self.cfg["overwrite"] | ||||
|         cdef bint extend = self.cfg["extend"] | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,12 +1,12 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| from collections import defaultdict | ||||
| from typing import Callable, Iterable, Optional | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
| from thinc.api import Config, Model | ||||
| 
 | ||||
| from ..language import Language | ||||
| from ..scorer import PRFScore, get_ner_prf | ||||
| from ..training import remove_bilu_prefix, validate_examples | ||||
| from ..scorer import get_ner_prf | ||||
| from ..training import remove_bilu_prefix | ||||
| from ..util import registry | ||||
| from ._parser_internals.ner import BiluoPushDown | ||||
| from ._parser_internals.transition_system import TransitionSystem | ||||
|  |  | |||
|  | @ -1,12 +1,11 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| import warnings | ||||
| from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union | ||||
| from typing import Callable, Dict, Iterable, Iterator, Tuple, Union | ||||
| 
 | ||||
| import srsly | ||||
| 
 | ||||
| from ..tokens.doc cimport Doc | ||||
| 
 | ||||
| from ..errors import Errors, Warnings | ||||
| from ..errors import Errors | ||||
| from ..language import Language | ||||
| from ..training import Example | ||||
| from ..util import raise_error | ||||
|  | @ -33,7 +32,7 @@ cdef class Pipe: | |||
|         """ | ||||
|         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name)) | ||||
| 
 | ||||
|     def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: | ||||
|     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|         the hood when the nlp object is called on a text and all components are | ||||
|         applied to the Doc. | ||||
|  | @ -52,7 +51,7 @@ cdef class Pipe: | |||
|             except Exception as e: | ||||
|                 error_handler(self.name, self, [doc], e) | ||||
| 
 | ||||
|     def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): | ||||
|     def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None): | ||||
|         """Initialize the pipe. For non-trainable components, this method | ||||
|         is optional. For trainable components, which should inherit | ||||
|         from the subclass TrainablePipe, the provided data examples | ||||
|  |  | |||
|  | @ -7,7 +7,6 @@ from ..tokens.doc cimport Doc | |||
| 
 | ||||
| from .. import util | ||||
| from ..language import Language | ||||
| from ..scorer import Scorer | ||||
| from .pipe import Pipe | ||||
| from .senter import senter_score | ||||
| 
 | ||||
|  | @ -34,17 +33,19 @@ class Sentencizer(Pipe): | |||
|     DOCS: https://spacy.io/api/sentencizer | ||||
|     """ | ||||
| 
 | ||||
|     default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', | ||||
|             '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', | ||||
|             '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', | ||||
|             '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', | ||||
|             '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', | ||||
|             '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', | ||||
|             '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', | ||||
|             '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', | ||||
|             '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', | ||||
|             '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', | ||||
|             '。', '。'] | ||||
|     default_punct_chars = [ | ||||
|         '!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', | ||||
|         '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', | ||||
|         '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', | ||||
|         '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', | ||||
|         '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', | ||||
|         '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', | ||||
|         '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', | ||||
|         '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', | ||||
|         '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', | ||||
|         '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', | ||||
|         '。', '。' | ||||
|     ] | ||||
| 
 | ||||
|     def __init__( | ||||
|         self, | ||||
|  | @ -127,7 +128,6 @@ class Sentencizer(Pipe): | |||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|         cdef Doc doc | ||||
|         cdef int idx = 0 | ||||
|         for i, doc in enumerate(docs): | ||||
|             doc_tag_ids = batch_tag_ids[i] | ||||
|             for j, tag_id in enumerate(doc_tag_ids): | ||||
|  | @ -168,7 +168,6 @@ class Sentencizer(Pipe): | |||
|         path = path.with_suffix(".json") | ||||
|         srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite}) | ||||
| 
 | ||||
| 
 | ||||
|     def from_disk(self, path, *, exclude=tuple()): | ||||
|         """Load the sentencizer from disk. | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,11 +1,9 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| from itertools import islice | ||||
| from typing import Callable, Dict, Iterable, List, Optional, Union | ||||
| from typing import Callable, Iterable, Optional | ||||
| 
 | ||||
| import srsly | ||||
| from thinc.api import Config, Model | ||||
| from thinc.legacy import LegacySequenceCategoricalCrossentropy | ||||
| from thinc.types import Floats2d, Ints1d | ||||
| 
 | ||||
| from ..tokens.doc cimport Doc | ||||
| 
 | ||||
|  |  | |||
|  | @ -48,14 +48,14 @@ DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model | |||
|         "threshold": 0.5, | ||||
|         "model": DEFAULT_SPAN_FINDER_MODEL, | ||||
|         "spans_key": DEFAULT_SPANS_KEY, | ||||
|         "max_length": None, | ||||
|         "max_length": 25, | ||||
|         "min_length": None, | ||||
|         "scorer": {"@scorers": "spacy.span_finder_scorer.v1"}, | ||||
|     }, | ||||
|     default_score_weights={ | ||||
|         f"span_finder_{DEFAULT_SPANS_KEY}_f": 1.0, | ||||
|         f"span_finder_{DEFAULT_SPANS_KEY}_p": 0.0, | ||||
|         f"span_finder_{DEFAULT_SPANS_KEY}_r": 0.0, | ||||
|         f"spans_{DEFAULT_SPANS_KEY}_f": 1.0, | ||||
|         f"spans_{DEFAULT_SPANS_KEY}_p": 0.0, | ||||
|         f"spans_{DEFAULT_SPANS_KEY}_r": 0.0, | ||||
|     }, | ||||
| ) | ||||
| def make_span_finder( | ||||
|  | @ -104,7 +104,7 @@ def make_span_finder_scorer(): | |||
| 
 | ||||
| def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: | ||||
|     kwargs = dict(kwargs) | ||||
|     attr_prefix = "span_finder_" | ||||
|     attr_prefix = "spans_" | ||||
|     key = kwargs["spans_key"] | ||||
|     kwargs.setdefault("attr", f"{attr_prefix}{key}") | ||||
|     kwargs.setdefault( | ||||
|  |  | |||
|  | @ -1,27 +1,20 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| import warnings | ||||
| from itertools import islice | ||||
| from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union | ||||
| 
 | ||||
| import numpy | ||||
| import srsly | ||||
| from thinc.api import Config, Model, set_dropout_rate | ||||
| from thinc.legacy import LegacySequenceCategoricalCrossentropy | ||||
| from thinc.types import Floats2d, Ints1d | ||||
| 
 | ||||
| from ..morphology cimport Morphology | ||||
| from ..tokens.doc cimport Doc | ||||
| from ..vocab cimport Vocab | ||||
| 
 | ||||
| from .. import util | ||||
| from ..attrs import ID, POS | ||||
| from ..errors import Errors, Warnings | ||||
| from ..errors import Errors | ||||
| from ..language import Language | ||||
| from ..parts_of_speech import X | ||||
| from ..scorer import Scorer | ||||
| from ..training import validate_examples, validate_get_examples | ||||
| from ..util import registry | ||||
| from .pipe import deserialize_config | ||||
| from .trainable_pipe import TrainablePipe | ||||
| 
 | ||||
| ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] | ||||
|  | @ -188,7 +181,6 @@ class Tagger(TrainablePipe): | |||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|         cdef Doc doc | ||||
|         cdef Vocab vocab = self.vocab | ||||
|         cdef bint overwrite = self.cfg["overwrite"] | ||||
|         labels = self.labels | ||||
|         for i, doc in enumerate(docs): | ||||
|  | @ -281,7 +273,7 @@ class Tagger(TrainablePipe): | |||
|         student_scores: Scores representing the student model's predictions. | ||||
| 
 | ||||
|         RETURNS (Tuple[float, float]): The loss and the gradient. | ||||
|          | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tagger#get_teacher_student_loss | ||||
|         """ | ||||
|         loss_func = LegacySequenceCategoricalCrossentropy(normalize=False) | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| import warnings | ||||
| from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple | ||||
| 
 | ||||
| import srsly | ||||
|  | @ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate | |||
| from ..tokens.doc cimport Doc | ||||
| 
 | ||||
| from .. import util | ||||
| from ..errors import Errors, Warnings | ||||
| from ..errors import Errors | ||||
| from ..language import Language | ||||
| from ..training import Example, validate_distillation_examples, validate_examples | ||||
| from ..vocab import Vocab | ||||
|  | @ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe): | |||
|         except Exception as e: | ||||
|             error_handler(self.name, self, [doc], e) | ||||
| 
 | ||||
| 
 | ||||
|     def distill(self, | ||||
|                teacher_pipe: Optional["TrainablePipe"], | ||||
|                examples: Iterable["Example"], | ||||
|                *, | ||||
|                drop: float=0.0, | ||||
|                sgd: Optional[Optimizer]=None, | ||||
|                losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: | ||||
|                 teacher_pipe: Optional["TrainablePipe"], | ||||
|                 examples: Iterable["Example"], | ||||
|                 *, | ||||
|                 drop: float = 0.0, | ||||
|                 sgd: Optional[Optimizer] = None, | ||||
|                 losses: Optional[Dict[str, float]] = None | ||||
|                 ) -> Dict[str, float]: | ||||
|         """Train a pipe (the student) on the predictions of another pipe | ||||
|         (the teacher). The student is typically trained on the probability | ||||
|         distribution of the teacher, but details may differ per pipe. | ||||
|  | @ -79,7 +78,7 @@ cdef class TrainablePipe(Pipe): | |||
|         losses (Optional[Dict[str, float]]): Optional record of loss during | ||||
|             distillation. | ||||
|         RETURNS: The updated losses dictionary. | ||||
|          | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/pipe#distill | ||||
|         """ | ||||
|         # By default we require a teacher pipe, but there are downstream | ||||
|  | @ -103,7 +102,7 @@ cdef class TrainablePipe(Pipe): | |||
|         losses[self.name] += loss | ||||
|         return losses | ||||
| 
 | ||||
|     def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: | ||||
|     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|         the hood when the nlp object is called on a text and all components are | ||||
|         applied to the Doc. | ||||
|  | @ -150,9 +149,9 @@ cdef class TrainablePipe(Pipe): | |||
|     def update(self, | ||||
|                examples: Iterable["Example"], | ||||
|                *, | ||||
|                drop: float=0.0, | ||||
|                sgd: Optimizer=None, | ||||
|                losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: | ||||
|                drop: float = 0.0, | ||||
|                sgd: Optimizer = None, | ||||
|                losses: Optional[Dict[str, float]] = None) -> Dict[str, float]: | ||||
|         """Learn from a batch of documents and gold-standard information, | ||||
|         updating the pipe's model. Delegates to predict and get_loss. | ||||
| 
 | ||||
|  | @ -186,8 +185,8 @@ cdef class TrainablePipe(Pipe): | |||
|     def rehearse(self, | ||||
|                  examples: Iterable[Example], | ||||
|                  *, | ||||
|                  sgd: Optimizer=None, | ||||
|                  losses: Dict[str, float]=None, | ||||
|                  sgd: Optimizer = None, | ||||
|                  losses: Dict[str, float] = None, | ||||
|                  **config) -> Dict[str, float]: | ||||
|         """Perform a "rehearsal" update from a batch of data. Rehearsal updates | ||||
|         teach the current model to make predictions similar to an initial model, | ||||
|  | @ -224,7 +223,7 @@ cdef class TrainablePipe(Pipe): | |||
|         student_scores: Scores representing the student model's predictions. | ||||
| 
 | ||||
|         RETURNS (Tuple[float, float]): The loss and the gradient. | ||||
|          | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/pipe#get_teacher_student_loss | ||||
|         """ | ||||
|         raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name)) | ||||
|  | @ -238,7 +237,7 @@ cdef class TrainablePipe(Pipe): | |||
|         """ | ||||
|         return util.create_default_optimizer() | ||||
| 
 | ||||
|     def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): | ||||
|     def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None): | ||||
|         """Initialize the pipe for training, using data examples if available. | ||||
|         This method needs to be implemented by each TrainablePipe component, | ||||
|         ensuring the internal model (if available) is initialized properly | ||||
|  |  | |||
|  | @ -6,15 +6,9 @@ from typing import Dict, Iterable, List, Optional, Tuple | |||
| cimport numpy as np | ||||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| from itertools import islice | ||||
| 
 | ||||
| from libc.stdlib cimport calloc, free | ||||
| from libc.string cimport memcpy, memset | ||||
| from libcpp.vector cimport vector | ||||
| 
 | ||||
| import contextlib | ||||
| import random | ||||
| import warnings | ||||
| from itertools import islice | ||||
| 
 | ||||
| import numpy | ||||
| import numpy.random | ||||
|  | @ -23,44 +17,36 @@ from thinc.api import ( | |||
|     CupyOps, | ||||
|     NumpyOps, | ||||
|     Optimizer, | ||||
|     chain, | ||||
|     get_array_module, | ||||
|     get_ops, | ||||
|     set_dropout_rate, | ||||
|     softmax_activation, | ||||
|     use_ops, | ||||
| ) | ||||
| from thinc.legacy import LegacySequenceCategoricalCrossentropy | ||||
| from thinc.types import Floats2d, Ints1d | ||||
| 
 | ||||
| from ..ml.tb_framework import TransitionModelInputs | ||||
| 
 | ||||
| from ..tokens.doc cimport Doc | ||||
| from ._parser_internals cimport _beam_utils | ||||
| from ._parser_internals.search cimport Beam | ||||
| from ._parser_internals.stateclass cimport StateC, StateClass | ||||
| from .trainable_pipe cimport TrainablePipe | ||||
| 
 | ||||
| from ._parser_internals import _beam_utils | ||||
| 
 | ||||
| from ..typedefs cimport weight_t | ||||
| from ..vocab cimport Vocab | ||||
| from ._parser_internals cimport _beam_utils | ||||
| from ._parser_internals.stateclass cimport StateC, StateClass | ||||
| from ._parser_internals.transition_system cimport Transition, TransitionSystem | ||||
| from .trainable_pipe cimport TrainablePipe | ||||
| 
 | ||||
| from .. import util | ||||
| from ..errors import Errors, Warnings | ||||
| from ..errors import Errors | ||||
| from ..training import ( | ||||
|     validate_distillation_examples, | ||||
|     validate_examples, | ||||
|     validate_get_examples, | ||||
| ) | ||||
| from ._parser_internals import _beam_utils | ||||
| 
 | ||||
| 
 | ||||
| # TODO: Remove when we switch to Cython 3. | ||||
| cdef extern from "<algorithm>" namespace "std" nogil: | ||||
|     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except + | ||||
| 
 | ||||
| 
 | ||||
| NUMPY_OPS = NumpyOps() | ||||
| 
 | ||||
| 
 | ||||
|  | @ -236,12 +222,13 @@ class Parser(TrainablePipe): | |||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def distill(self, | ||||
|                teacher_pipe: Optional[TrainablePipe], | ||||
|                examples: Iterable["Example"], | ||||
|                *, | ||||
|                drop: float=0.0, | ||||
|                sgd: Optional[Optimizer]=None, | ||||
|                losses: Optional[Dict[str, float]]=None): | ||||
|                 teacher_pipe: Optional[TrainablePipe], | ||||
|                 examples: Iterable["Example"], | ||||
|                 *, | ||||
|                 drop: float = 0.0, | ||||
|                 sgd: Optional[Optimizer] = None, | ||||
|                 losses: Optional[Dict[str, float]] = None | ||||
|                 ): | ||||
|         """Train a pipe (the student) on the predictions of another pipe | ||||
|         (the teacher). The student is trained on the transition probabilities | ||||
|         of the teacher. | ||||
|  | @ -257,7 +244,7 @@ class Parser(TrainablePipe): | |||
|         losses (Optional[Dict[str, float]]): Optional record of loss during | ||||
|             distillation. | ||||
|         RETURNS: The updated losses dictionary. | ||||
|          | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/dependencyparser#distill | ||||
|         """ | ||||
|         if teacher_pipe is None: | ||||
|  | @ -291,11 +278,13 @@ class Parser(TrainablePipe): | |||
|         # teacher's distributions. | ||||
| 
 | ||||
|         student_inputs = TransitionModelInputs(docs=student_docs, | ||||
|             states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves) | ||||
|                                                states=[state.copy() for state in states], | ||||
|                                                moves=self.moves, | ||||
|                                                max_moves=max_moves) | ||||
|         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) | ||||
|         actions = _states_diff_to_actions(states, student_states) | ||||
|         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], | ||||
|             states=states, moves=teacher_pipe.moves, actions=actions) | ||||
|                                                states=states, moves=teacher_pipe.moves, actions=actions) | ||||
|         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs) | ||||
| 
 | ||||
|         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) | ||||
|  | @ -308,10 +297,9 @@ class Parser(TrainablePipe): | |||
| 
 | ||||
|         return losses | ||||
| 
 | ||||
| 
 | ||||
|     def get_teacher_student_loss( | ||||
|         self, teacher_scores: List[Floats2d], student_scores: List[Floats2d], | ||||
|         normalize: bool=False, | ||||
|             self, teacher_scores: List[Floats2d], student_scores: List[Floats2d], | ||||
|             normalize: bool = False, | ||||
|     ) -> Tuple[float, List[Floats2d]]: | ||||
|         """Calculate the loss and its gradient for a batch of student | ||||
|         scores, relative to teacher scores. | ||||
|  | @ -320,7 +308,7 @@ class Parser(TrainablePipe): | |||
|         student_scores: Scores representing the student model's predictions. | ||||
| 
 | ||||
|         RETURNS (Tuple[float, float]): The loss and the gradient. | ||||
|          | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss | ||||
|         """ | ||||
| 
 | ||||
|  | @ -334,9 +322,9 @@ class Parser(TrainablePipe): | |||
|         # ourselves. | ||||
| 
 | ||||
|         teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores), | ||||
|             axis=-1, inplace=True) | ||||
|                                                 axis=-1, inplace=True) | ||||
|         student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores), | ||||
|             axis=-1, inplace=True) | ||||
|                                                 axis=-1, inplace=True) | ||||
| 
 | ||||
|         assert teacher_scores.shape == student_scores.shape | ||||
| 
 | ||||
|  | @ -384,7 +372,6 @@ class Parser(TrainablePipe): | |||
|             except Exception as e: | ||||
|                 error_handler(self.name, self, batch_in_order, e) | ||||
| 
 | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|  | @ -414,7 +401,6 @@ class Parser(TrainablePipe): | |||
| 
 | ||||
|     def set_annotations(self, docs, states_or_beams): | ||||
|         cdef StateClass state | ||||
|         cdef Beam beam | ||||
|         cdef Doc doc | ||||
|         states = _beam_utils.collect_states(states_or_beams, docs) | ||||
|         for i, (state, doc) in enumerate(zip(states, docs)): | ||||
|  | @ -423,7 +409,6 @@ class Parser(TrainablePipe): | |||
|                 hook(doc) | ||||
| 
 | ||||
|     def update(self, examples, *, drop=0., sgd=None, losses=None): | ||||
|         cdef StateClass state | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         losses.setdefault(self.name, 0.) | ||||
|  | @ -453,13 +438,15 @@ class Parser(TrainablePipe): | |||
|         else: | ||||
|             init_states, gold_states, _ = self.moves.init_gold_batch(examples) | ||||
| 
 | ||||
|         inputs = TransitionModelInputs(docs=docs, moves=self.moves, | ||||
|             max_moves=max_moves, states=[state.copy() for state in init_states]) | ||||
|         inputs = TransitionModelInputs(docs=docs, | ||||
|                                        moves=self.moves, | ||||
|                                        max_moves=max_moves, | ||||
|                                        states=[state.copy() for state in init_states]) | ||||
|         (pred_states, scores), backprop_scores = self.model.begin_update(inputs) | ||||
|         if sum(s.shape[0] for s in scores) == 0: | ||||
|             return losses | ||||
|         d_scores = self.get_loss((gold_states, init_states, pred_states, scores), | ||||
|             examples, max_moves) | ||||
|                                  examples, max_moves) | ||||
|         backprop_scores((pred_states, d_scores)) | ||||
|         if sgd not in (None, False): | ||||
|             self.finish_update(sgd) | ||||
|  | @ -500,9 +487,7 @@ class Parser(TrainablePipe): | |||
|         cdef TransitionSystem moves = self.moves | ||||
|         cdef StateClass state | ||||
|         cdef int clas | ||||
|         cdef int nF = self.model.get_dim("nF") | ||||
|         cdef int nO = moves.n_moves | ||||
|         cdef int nS = sum([len(history) for history in histories]) | ||||
|         cdef Pool mem = Pool() | ||||
|         cdef np.ndarray costs_i | ||||
|         is_valid = <int*>mem.alloc(nO, sizeof(int)) | ||||
|  | @ -569,8 +554,8 @@ class Parser(TrainablePipe): | |||
| 
 | ||||
|         return losses | ||||
| 
 | ||||
|     def update_beam(self, examples, *, beam_width, | ||||
|             drop=0., sgd=None, losses=None, beam_density=0.0): | ||||
|     def update_beam(self, examples, *, beam_width, drop=0., | ||||
|                     sgd=None, losses=None, beam_density=0.0): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def set_output(self, nO): | ||||
|  | @ -695,9 +680,10 @@ class Parser(TrainablePipe): | |||
|             return states | ||||
| 
 | ||||
|         # Parse the states that are too long with the teacher's parsing model. | ||||
|         teacher_inputs = TransitionModelInputs(docs=docs, moves=moves, | ||||
|             states=[state.copy() for state in to_cut]) | ||||
|         (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs) | ||||
|         teacher_inputs = TransitionModelInputs(docs=docs, | ||||
|                                                moves=moves, | ||||
|                                                states=[state.copy() for state in to_cut]) | ||||
|         (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs) | ||||
| 
 | ||||
|         # Step through the teacher's actions and store every state after | ||||
|         # each multiple of max_length. | ||||
|  | @ -795,6 +781,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]: | |||
| 
 | ||||
|     return actions | ||||
| 
 | ||||
| 
 | ||||
| def _states_diff_to_actions( | ||||
|     before_states: List[StateClass], | ||||
|     after_states: List[StateClass] | ||||
|  | @ -815,8 +802,9 @@ def _states_diff_to_actions( | |||
|         c_state_before = before_state.c | ||||
|         c_state_after = after_state.c | ||||
| 
 | ||||
|         assert equal(c_state_before.history.begin(), c_state_before.history.end(), | ||||
|             c_state_after.history.begin()) | ||||
|         assert equal(c_state_before.history.begin(), | ||||
|                      c_state_before.history.end(), | ||||
|                      c_state_after.history.begin()) | ||||
| 
 | ||||
|     actions = [] | ||||
|     while True: | ||||
|  |  | |||
|  | @ -1,10 +1,8 @@ | |||
| # cython: infer_types=True | ||||
| from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union | ||||
| from typing import Iterable, Iterator, List, Optional, Tuple, Union | ||||
| 
 | ||||
| cimport cython | ||||
| from libc.stdint cimport uint32_t | ||||
| from libc.string cimport memcpy | ||||
| from libcpp.set cimport set | ||||
| from murmurhash.mrmr cimport hash64 | ||||
| 
 | ||||
| import srsly | ||||
|  | @ -244,7 +242,6 @@ cdef class StringStore: | |||
|         cdef int n_length_bytes | ||||
|         cdef int i | ||||
|         cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str)) | ||||
|         cdef uint32_t ulength = length | ||||
|         if length < sizeof(string.s): | ||||
|             string.s[0] = <unsigned char>length | ||||
|             memcpy(&string.s[1], chars, length) | ||||
|  | @ -302,7 +299,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1: | |||
| 
 | ||||
|     try: | ||||
|         return hash_string(string_or_hash) | ||||
|     except: | ||||
|     except:   # no-cython-lint | ||||
|         if _try_coerce_to_hash(string_or_hash, &str_hash): | ||||
|             # Coerce the integral key to the expected primitive hash type. | ||||
|             # This ensures that custom/overloaded "primitive" data types | ||||
|  | @ -319,6 +316,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash): | |||
|     try: | ||||
|         out_hash[0] = key | ||||
|         return True | ||||
|     except: | ||||
|     except:  # no-cython-lint | ||||
|         return False | ||||
| 
 | ||||
|  |  | |||
|  | @ -52,7 +52,7 @@ cdef struct TokenC: | |||
| 
 | ||||
|     int sent_start | ||||
|     int ent_iob | ||||
|     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. | ||||
|     attr_t ent_type  # TODO: Is there a better way to do this? Multiple sources of truth.. | ||||
|     attr_t ent_kb_id | ||||
|     hash_t ent_id | ||||
| 
 | ||||
|  |  | |||
|  | @ -93,7 +93,7 @@ cdef enum symbol_t: | |||
|     ADV | ||||
|     AUX | ||||
|     CONJ | ||||
|     CCONJ # U20 | ||||
|     CCONJ  # U20 | ||||
|     DET | ||||
|     INTJ | ||||
|     NOUN | ||||
|  | @ -419,7 +419,7 @@ cdef enum symbol_t: | |||
|     ccomp | ||||
|     complm | ||||
|     conj | ||||
|     cop # U20 | ||||
|     cop  # U20 | ||||
|     csubj | ||||
|     csubjpass | ||||
|     dep | ||||
|  | @ -442,8 +442,8 @@ cdef enum symbol_t: | |||
|     num | ||||
|     number | ||||
|     oprd | ||||
|     obj # U20 | ||||
|     obl # U20 | ||||
|     obj  # U20 | ||||
|     obl  # U20 | ||||
|     parataxis | ||||
|     partmod | ||||
|     pcomp | ||||
|  |  | |||
|  | @ -96,7 +96,7 @@ IDS = { | |||
|     "ADV": ADV, | ||||
|     "AUX": AUX, | ||||
|     "CONJ": CONJ, | ||||
|     "CCONJ": CCONJ, # U20 | ||||
|     "CCONJ": CCONJ,  # U20 | ||||
|     "DET": DET, | ||||
|     "INTJ": INTJ, | ||||
|     "NOUN": NOUN, | ||||
|  | @ -421,7 +421,7 @@ IDS = { | |||
|     "ccomp": ccomp, | ||||
|     "complm": complm, | ||||
|     "conj": conj, | ||||
|     "cop": cop, # U20 | ||||
|     "cop": cop,  # U20 | ||||
|     "csubj": csubj, | ||||
|     "csubjpass": csubjpass, | ||||
|     "dep": dep, | ||||
|  | @ -444,8 +444,8 @@ IDS = { | |||
|     "num": num, | ||||
|     "number": number, | ||||
|     "oprd": oprd, | ||||
|     "obj": obj, # U20 | ||||
|     "obl": obl, # U20 | ||||
|     "obj": obj,  # U20 | ||||
|     "obl": obl,  # U20 | ||||
|     "parataxis": parataxis, | ||||
|     "partmod": partmod, | ||||
|     "pcomp": pcomp, | ||||
|  |  | |||
|  | @ -52,7 +52,8 @@ TEST_PATTERNS = [ | |||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]] | ||||
|     "pattern", | ||||
|     [[{"XX": "y"}], [{"LENGTH": "2"}], [{"TEXT": {"IN": 5}}], [{"text": {"in": 6}}]], | ||||
| ) | ||||
| def test_matcher_pattern_validation(en_vocab, pattern): | ||||
|     matcher = Matcher(en_vocab, validate=True) | ||||
|  |  | |||
|  | @ -11,6 +11,7 @@ def test_build_dependencies(): | |||
|         "flake8", | ||||
|         "hypothesis", | ||||
|         "pre-commit", | ||||
|         "cython-lint", | ||||
|         "black", | ||||
|         "isort", | ||||
|         "mypy", | ||||
|  |  | |||
|  | @ -2,7 +2,7 @@ | |||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation | ||||
| from spacy.typedefs cimport class_t, weight_t | ||||
| from spacy.typedefs cimport class_t | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
|  | @ -42,32 +42,35 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1: | |||
|     state = <TestState*>state | ||||
|     mem.free(state) | ||||
| 
 | ||||
| 
 | ||||
| @cytest | ||||
| @pytest.mark.parametrize("nr_class,beam_width", | ||||
|     [ | ||||
|         (2, 3), | ||||
|         (3, 6), | ||||
|         (4, 20), | ||||
|     ] | ||||
| ) | ||||
|                          [ | ||||
|                              (2, 3), | ||||
|                              (3, 6), | ||||
|                              (4, 20), | ||||
|                          ] | ||||
|                          ) | ||||
| def test_init(nr_class, beam_width): | ||||
|     b = Beam(nr_class, beam_width) | ||||
|     assert b.size == 1 | ||||
|     assert b.width == beam_width | ||||
|     assert b.nr_class == nr_class | ||||
| 
 | ||||
| 
 | ||||
| @cytest | ||||
| def test_init_violn(): | ||||
|     MaxViolation() | ||||
| 
 | ||||
| 
 | ||||
| @cytest | ||||
| @pytest.mark.parametrize("nr_class,beam_width,length", | ||||
|     [ | ||||
|         (2, 3, 3), | ||||
|         (3, 6, 15), | ||||
|         (4, 20, 32), | ||||
|     ] | ||||
| ) | ||||
|                          [ | ||||
|                              (2, 3, 3), | ||||
|                              (3, 6, 15), | ||||
|                              (4, 20, 32), | ||||
|                          ] | ||||
|                          ) | ||||
| def test_initialize(nr_class, beam_width, length): | ||||
|     b = Beam(nr_class, beam_width) | ||||
|     b.initialize(initialize, destroy, length, NULL) | ||||
|  | @ -79,11 +82,11 @@ def test_initialize(nr_class, beam_width, length): | |||
| 
 | ||||
| @cytest | ||||
| @pytest.mark.parametrize("nr_class,beam_width,length,extra", | ||||
|     [ | ||||
|         (2, 3, 4, None), | ||||
|         (3, 6, 15, u"test beam 1"), | ||||
|     ] | ||||
| ) | ||||
|                          [ | ||||
|                              (2, 3, 4, None), | ||||
|                              (3, 6, 15, u"test beam 1"), | ||||
|                          ] | ||||
|                          ) | ||||
| def test_initialize_extra(nr_class, beam_width, length, extra): | ||||
|     b = Beam(nr_class, beam_width) | ||||
|     if extra is None: | ||||
|  | @ -97,11 +100,11 @@ def test_initialize_extra(nr_class, beam_width, length, extra): | |||
| 
 | ||||
| @cytest | ||||
| @pytest.mark.parametrize("nr_class,beam_width,length", | ||||
|     [ | ||||
|         (3, 6, 15), | ||||
|         (4, 20, 32), | ||||
|     ] | ||||
| ) | ||||
|                          [ | ||||
|                              (3, 6, 15), | ||||
|                              (4, 20, 32), | ||||
|                          ] | ||||
|                          ) | ||||
| def test_transition(nr_class, beam_width, length): | ||||
|     b = Beam(nr_class, beam_width) | ||||
|     b.initialize(initialize, destroy, length, NULL) | ||||
|  |  | |||
|  | @ -230,10 +230,10 @@ def test_overfitting_IO(): | |||
| 
 | ||||
|     # Test scoring | ||||
|     scores = nlp.evaluate(train_examples) | ||||
|     assert f"span_finder_{SPANS_KEY}_f" in scores | ||||
|     assert f"spans_{SPANS_KEY}_f" in scores | ||||
|     # It's not perfect 1.0 F1 because it's designed to overgenerate for now. | ||||
|     assert scores[f"span_finder_{SPANS_KEY}_p"] == 0.75 | ||||
|     assert scores[f"span_finder_{SPANS_KEY}_r"] == 1.0 | ||||
|     assert scores[f"spans_{SPANS_KEY}_p"] == 0.75 | ||||
|     assert scores[f"spans_{SPANS_KEY}_r"] == 1.0 | ||||
| 
 | ||||
|     # also test that the spancat works for just a single entity in a sentence | ||||
|     doc = nlp("London") | ||||
|  |  | |||
|  | @ -192,8 +192,7 @@ def test_tok2vec_listener(with_vectors): | |||
|         for tag in t[1]["tags"]: | ||||
|             tagger.add_label(tag) | ||||
| 
 | ||||
|     # Check that the Tok2Vec component finds it listeners | ||||
|     assert tok2vec.listeners == [] | ||||
|     # Check that the Tok2Vec component finds its listeners | ||||
|     optimizer = nlp.initialize(lambda: train_examples) | ||||
|     assert tok2vec.listeners == [tagger_tok2vec] | ||||
| 
 | ||||
|  | @ -221,7 +220,6 @@ def test_tok2vec_listener_callback(): | |||
|     assert nlp.pipe_names == ["tok2vec", "tagger"] | ||||
|     tagger = nlp.get_pipe("tagger") | ||||
|     tok2vec = nlp.get_pipe("tok2vec") | ||||
|     nlp._link_components() | ||||
|     docs = [nlp.make_doc("A random sentence")] | ||||
|     tok2vec.model.initialize(X=docs) | ||||
|     gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs] | ||||
|  | @ -430,29 +428,46 @@ def test_replace_listeners_from_config(): | |||
|         nlp.to_disk(dir_path) | ||||
|         base_model = str(dir_path) | ||||
|         new_config = { | ||||
|             "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, | ||||
|             "nlp": { | ||||
|                 "lang": "en", | ||||
|                 "pipeline": ["tok2vec", "tagger2", "ner3", "tagger4"], | ||||
|             }, | ||||
|             "components": { | ||||
|                 "tok2vec": {"source": base_model}, | ||||
|                 "tagger": { | ||||
|                 "tagger2": { | ||||
|                     "source": base_model, | ||||
|                     "component": "tagger", | ||||
|                     "replace_listeners": ["model.tok2vec"], | ||||
|                 }, | ||||
|                 "ner": {"source": base_model}, | ||||
|                 "ner3": { | ||||
|                     "source": base_model, | ||||
|                     "component": "ner", | ||||
|                 }, | ||||
|                 "tagger4": { | ||||
|                     "source": base_model, | ||||
|                     "component": "tagger", | ||||
|                 }, | ||||
|             }, | ||||
|         } | ||||
|         new_nlp = util.load_model_from_config(new_config, auto_fill=True) | ||||
|     new_nlp.initialize(lambda: examples) | ||||
|     tok2vec = new_nlp.get_pipe("tok2vec") | ||||
|     tagger = new_nlp.get_pipe("tagger") | ||||
|     ner = new_nlp.get_pipe("ner") | ||||
|     assert tok2vec.listening_components == ["ner"] | ||||
|     tagger = new_nlp.get_pipe("tagger2") | ||||
|     ner = new_nlp.get_pipe("ner3") | ||||
|     assert "ner" not in new_nlp.pipe_names | ||||
|     assert "tagger" not in new_nlp.pipe_names | ||||
|     assert tok2vec.listening_components == ["ner3", "tagger4"] | ||||
|     assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk()) | ||||
|     assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk()) | ||||
|     t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"] | ||||
|     assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2" | ||||
|     assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg | ||||
|     assert new_nlp.config["components"]["tagger2"]["model"]["tok2vec"] == t2v_cfg | ||||
|     assert ( | ||||
|         new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"] | ||||
|         new_nlp.config["components"]["ner3"]["model"]["tok2vec"]["@architectures"] | ||||
|         == "spacy.Tok2VecListener.v1" | ||||
|     ) | ||||
|     assert ( | ||||
|         new_nlp.config["components"]["tagger4"]["model"]["tok2vec"]["@architectures"] | ||||
|         == "spacy.Tok2VecListener.v1" | ||||
|     ) | ||||
| 
 | ||||
|  | @ -627,3 +642,57 @@ def test_tok2vec_distillation_teacher_annotations(): | |||
| 
 | ||||
|     student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec) | ||||
|     student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={}) | ||||
| 
 | ||||
| 
 | ||||
| def test_tok2vec_listener_source_link_name(): | ||||
|     """The component's internal name and the tok2vec listener map correspond | ||||
|     to the most recently modified pipeline. | ||||
|     """ | ||||
|     orig_config = Config().from_str(cfg_string_multi) | ||||
|     nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) | ||||
|     assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"] | ||||
| 
 | ||||
|     nlp2 = English() | ||||
|     nlp2.add_pipe("tok2vec", source=nlp1) | ||||
|     nlp2.add_pipe("tagger", name="tagger2", source=nlp1) | ||||
| 
 | ||||
|     # there is no way to have the component have the right name for both | ||||
|     # pipelines, right now the most recently modified pipeline is prioritized | ||||
|     assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2" | ||||
| 
 | ||||
|     # there is no way to have the tok2vec have the right listener map for both | ||||
|     # pipelines, right now the most recently modified pipeline is prioritized | ||||
|     assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"] | ||||
|     nlp2.add_pipe("ner", name="ner3", source=nlp1) | ||||
|     assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"] | ||||
|     nlp2.remove_pipe("ner3") | ||||
|     assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"] | ||||
|     nlp2.remove_pipe("tagger2") | ||||
|     assert nlp2.get_pipe("tok2vec").listening_components == [] | ||||
| 
 | ||||
|     # at this point the tok2vec component corresponds to nlp2 | ||||
|     assert nlp1.get_pipe("tok2vec").listening_components == [] | ||||
| 
 | ||||
|     # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1 | ||||
|     nlp1.add_pipe("sentencizer") | ||||
|     assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"] | ||||
| 
 | ||||
|     # modifying nlp2 syncs it back to nlp2 | ||||
|     nlp2.add_pipe("sentencizer") | ||||
|     assert nlp1.get_pipe("tok2vec").listening_components == [] | ||||
| 
 | ||||
| 
 | ||||
| def test_tok2vec_listener_source_replace_listeners(): | ||||
|     orig_config = Config().from_str(cfg_string_multi) | ||||
|     nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) | ||||
|     assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"] | ||||
|     nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"]) | ||||
|     assert nlp1.get_pipe("tok2vec").listening_components == ["ner"] | ||||
| 
 | ||||
|     nlp2 = English() | ||||
|     nlp2.add_pipe("tok2vec", source=nlp1) | ||||
|     assert nlp2.get_pipe("tok2vec").listening_components == [] | ||||
|     nlp2.add_pipe("tagger", source=nlp1) | ||||
|     assert nlp2.get_pipe("tok2vec").listening_components == [] | ||||
|     nlp2.add_pipe("ner", name="ner2", source=nlp1) | ||||
|     assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"] | ||||
|  |  | |||
|  | @ -18,6 +18,7 @@ from spacy.ml.models import ( | |||
|     build_Tok2Vec_model, | ||||
| ) | ||||
| from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain | ||||
| from spacy.training import Example | ||||
| from spacy.util import ( | ||||
|     load_config, | ||||
|     load_config_from_str, | ||||
|  | @ -469,6 +470,55 @@ def test_config_overrides(): | |||
|     assert nlp.pipe_names == ["tok2vec", "tagger"] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore:\\[W036") | ||||
| def test_config_overrides_registered_functions(): | ||||
|     nlp = spacy.blank("en") | ||||
|     nlp.add_pipe("attribute_ruler") | ||||
|     with make_tempdir() as d: | ||||
|         nlp.to_disk(d) | ||||
|         nlp_re1 = spacy.load( | ||||
|             d, | ||||
|             config={ | ||||
|                 "components": { | ||||
|                     "attribute_ruler": { | ||||
|                         "scorer": {"@scorers": "spacy.tagger_scorer.v1"} | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|         ) | ||||
|         assert ( | ||||
|             nlp_re1.config["components"]["attribute_ruler"]["scorer"]["@scorers"] | ||||
|             == "spacy.tagger_scorer.v1" | ||||
|         ) | ||||
| 
 | ||||
|         @registry.misc("test_some_other_key") | ||||
|         def misc_some_other_key(): | ||||
|             return "some_other_key" | ||||
| 
 | ||||
|         nlp_re2 = spacy.load( | ||||
|             d, | ||||
|             config={ | ||||
|                 "components": { | ||||
|                     "attribute_ruler": { | ||||
|                         "scorer": { | ||||
|                             "@scorers": "spacy.overlapping_labeled_spans_scorer.v1", | ||||
|                             "spans_key": {"@misc": "test_some_other_key"}, | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|         ) | ||||
|         assert nlp_re2.config["components"]["attribute_ruler"]["scorer"][ | ||||
|             "spans_key" | ||||
|         ] == {"@misc": "test_some_other_key"} | ||||
|         # run dummy evaluation (will return None scores) in order to test that | ||||
|         # the spans_key value in the nested override is working as intended in | ||||
|         # the config | ||||
|         example = Example.from_dict(nlp_re2.make_doc("a b c"), {}) | ||||
|         scores = nlp_re2.evaluate([example]) | ||||
|         assert "spans_some_other_key_f" in scores | ||||
| 
 | ||||
| 
 | ||||
| def test_config_interpolation(): | ||||
|     config = Config().from_str(nlp_config_string, interpolate=False) | ||||
|     assert config["corpora"]["train"]["path"] == "${paths.train}" | ||||
|  |  | |||
|  | @ -697,7 +697,6 @@ def test_string_to_list_intify(value): | |||
|     assert string_to_list(value, intify=True) == [1, 2, 3] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.skip(reason="Temporarily skip before models are published") | ||||
| def test_download_compatibility(): | ||||
|     spec = SpecifierSet("==" + about.__version__) | ||||
|     spec.prereleases = False | ||||
|  | @ -708,7 +707,6 @@ def test_download_compatibility(): | |||
|         assert get_minor_version(about.__version__) == get_minor_version(version) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.skip(reason="Temporarily skip before models are published") | ||||
| def test_validate_compatibility_table(): | ||||
|     spec = SpecifierSet("==" + about.__version__) | ||||
|     spec.prereleases = False | ||||
|  |  | |||
|  | @ -377,3 +377,22 @@ def test_displacy_manual_sorted_entities(): | |||
| 
 | ||||
|     html = displacy.render(doc, style="ent", manual=True) | ||||
|     assert html.find("FIRST") < html.find("SECOND") | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.issue(12816) | ||||
| def test_issue12816(en_vocab) -> None: | ||||
|     """Test that displaCy's span visualizer escapes annotated HTML tags correctly.""" | ||||
|     # Create a doc containing an annotated word and an unannotated HTML tag | ||||
|     doc = Doc(en_vocab, words=["test", "<TEST>"]) | ||||
|     doc.spans["sc"] = [Span(doc, 0, 1, label="test")] | ||||
| 
 | ||||
|     # Verify that the HTML tag is escaped when unannotated | ||||
|     html = displacy.render(doc, style="span") | ||||
|     assert "&lt;TEST&gt;" in html | ||||
| 
 | ||||
|     # Annotate the HTML tag | ||||
|     doc.spans["sc"].append(Span(doc, 1, 2, label="test")) | ||||
| 
 | ||||
|     # Verify that the HTML tag is still escaped | ||||
|     html = displacy.render(doc, style="span") | ||||
|     assert "&lt;TEST&gt;" in html | ||||
|  |  | |||
|  | @ -220,6 +220,10 @@ def test_minor_version(a1, a2, b1, b2, is_match): | |||
|             {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01}, | ||||
|             {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}}, | ||||
|         ), | ||||
|         ( | ||||
|             {"attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"}, | ||||
|             {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}}, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_dot_to_dict(dot_notation, expected): | ||||
|  | @ -228,6 +232,29 @@ def test_dot_to_dict(dot_notation, expected): | |||
|     assert util.dict_to_dot(result) == dot_notation | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "dot_notation,expected", | ||||
|     [ | ||||
|         ( | ||||
|             {"token.pos": True, "token._.xyz": True}, | ||||
|             {"token": {"pos": True, "_": {"xyz": True}}}, | ||||
|         ), | ||||
|         ( | ||||
|             {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01}, | ||||
|             {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}}, | ||||
|         ), | ||||
|         ( | ||||
|             {"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}}, | ||||
|             {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}}, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_dot_to_dict_overrides(dot_notation, expected): | ||||
|     result = util.dot_to_dict(dot_notation) | ||||
|     assert result == expected | ||||
|     assert util.dict_to_dot(result, for_overrides=True) == dot_notation | ||||
| 
 | ||||
| 
 | ||||
| def test_set_dot_to_object(): | ||||
|     config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}} | ||||
|     with pytest.raises(KeyError): | ||||
|  |  | |||
|  | @ -401,6 +401,7 @@ def test_vectors_serialize(): | |||
|         row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f")) | ||||
|         assert row == row_r | ||||
|         assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data)) | ||||
|         assert v.attr == v_r.attr | ||||
| 
 | ||||
| 
 | ||||
| def test_vector_is_oov(): | ||||
|  | @ -645,3 +646,32 @@ def test_equality(): | |||
|     vectors1.resize((5, 9)) | ||||
|     vectors2.resize((5, 9)) | ||||
|     assert vectors1 == vectors2 | ||||
| 
 | ||||
| 
 | ||||
| def test_vectors_attr(): | ||||
|     data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") | ||||
|     # default ORTH | ||||
|     nlp = English() | ||||
|     nlp.vocab.vectors = Vectors(data=data, keys=["A", "B", "C"]) | ||||
|     assert nlp.vocab.strings["A"] in nlp.vocab.vectors.key2row | ||||
|     assert nlp.vocab.strings["a"] not in nlp.vocab.vectors.key2row | ||||
|     assert nlp.vocab["A"].has_vector is True | ||||
|     assert nlp.vocab["a"].has_vector is False | ||||
|     assert nlp("A")[0].has_vector is True | ||||
|     assert nlp("a")[0].has_vector is False | ||||
| 
 | ||||
|     # custom LOWER | ||||
|     nlp = English() | ||||
|     nlp.vocab.vectors = Vectors(data=data, keys=["a", "b", "c"], attr="LOWER") | ||||
|     assert nlp.vocab.strings["A"] not in nlp.vocab.vectors.key2row | ||||
|     assert nlp.vocab.strings["a"] in nlp.vocab.vectors.key2row | ||||
|     assert nlp.vocab["A"].has_vector is True | ||||
|     assert nlp.vocab["a"].has_vector is True | ||||
|     assert nlp("A")[0].has_vector is True | ||||
|     assert nlp("a")[0].has_vector is True | ||||
|     # add a new vectors entry | ||||
|     assert nlp.vocab["D"].has_vector is False | ||||
|     assert nlp.vocab["d"].has_vector is False | ||||
|     nlp.vocab.set_vector("D", numpy.asarray([4, 5, 6])) | ||||
|     assert nlp.vocab["D"].has_vector is True | ||||
|     assert nlp.vocab["d"].has_vector is True | ||||
|  |  | |||
|  | @ -26,24 +26,57 @@ cdef class Tokenizer: | |||
| 
 | ||||
|     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases) | ||||
|     cdef int _apply_special_cases(self, Doc doc) except -1 | ||||
|     cdef void _filter_special_spans(self, vector[SpanC] &original, | ||||
|                             vector[SpanC] &filtered, int doc_len) nogil | ||||
|     cdef object _prepare_special_spans(self, Doc doc, | ||||
|                                        vector[SpanC] &filtered) | ||||
|     cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, | ||||
|                                        object span_data) | ||||
|     cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, | ||||
|                                      int* has_special, | ||||
|                                      bint with_special_cases) except -1 | ||||
|     cdef int _tokenize(self, Doc tokens, str span, hash_t key, | ||||
|                        int* has_special, bint with_special_cases) except -1 | ||||
|     cdef str _split_affixes(self, str string, | ||||
|                                 vector[LexemeC*] *prefixes, | ||||
|                                 vector[LexemeC*] *suffixes, int* has_special, | ||||
|                                 bint with_special_cases) | ||||
|     cdef int _attach_tokens(self, Doc tokens, str string, | ||||
|                             vector[LexemeC*] *prefixes, | ||||
|                             vector[LexemeC*] *suffixes, int* has_special, | ||||
|                             bint with_special_cases) except -1 | ||||
|     cdef int _save_cached(self, const TokenC* tokens, hash_t key, | ||||
|                           int* has_special, int n) except -1 | ||||
|     cdef void _filter_special_spans( | ||||
|         self, | ||||
|         vector[SpanC] &original, | ||||
|         vector[SpanC] &filtered, | ||||
|         int doc_len, | ||||
|     ) nogil | ||||
|     cdef object _prepare_special_spans( | ||||
|         self, | ||||
|         Doc doc, | ||||
|         vector[SpanC] &filtered, | ||||
|     ) | ||||
|     cdef int _retokenize_special_spans( | ||||
|         self, | ||||
|         Doc doc, | ||||
|         TokenC* tokens, | ||||
|         object span_data, | ||||
|     ) | ||||
|     cdef int _try_specials_and_cache( | ||||
|         self, | ||||
|         hash_t key, | ||||
|         Doc tokens, | ||||
|         int* has_special, | ||||
|         bint with_special_cases, | ||||
|     ) except -1 | ||||
|     cdef int _tokenize( | ||||
|         self, | ||||
|         Doc tokens, | ||||
|         str span, | ||||
|         hash_t key, | ||||
|         int* has_special, | ||||
|         bint with_special_cases, | ||||
|     ) except -1 | ||||
|     cdef str _split_affixes( | ||||
|         self, | ||||
|         str string, | ||||
|         vector[LexemeC*] *prefixes, | ||||
|         vector[LexemeC*] *suffixes, int* has_special, | ||||
|         bint with_special_cases, | ||||
|     ) | ||||
|     cdef int _attach_tokens( | ||||
|         self, | ||||
|         Doc tokens, | ||||
|         str string, | ||||
|         vector[LexemeC*] *prefixes, | ||||
|         vector[LexemeC*] *suffixes, int* has_special, | ||||
|         bint with_special_cases, | ||||
|     ) except -1 | ||||
|     cdef int _save_cached( | ||||
|         self, | ||||
|         const TokenC* tokens, | ||||
|         hash_t key, | ||||
|         int* has_special, | ||||
|         int n, | ||||
|     ) except -1 | ||||
|  |  | |||
|  | @ -323,7 +323,7 @@ cdef class Tokenizer: | |||
|         cdef int span_start | ||||
|         cdef int span_end | ||||
|         while i < doc.length: | ||||
|             if not i in span_data: | ||||
|             if i not in span_data: | ||||
|                 tokens[i + offset] = doc.c[i] | ||||
|                 i += 1 | ||||
|             else: | ||||
|  | @ -394,12 +394,14 @@ cdef class Tokenizer: | |||
|         self._save_cached(&tokens.c[orig_size], orig_key, has_special, | ||||
|                           tokens.length - orig_size) | ||||
| 
 | ||||
|     cdef str _split_affixes(self, str string, | ||||
|                                 vector[const LexemeC*] *prefixes, | ||||
|                                 vector[const LexemeC*] *suffixes, | ||||
|                                 int* has_special, | ||||
|                                 bint with_special_cases): | ||||
|         cdef size_t i | ||||
|     cdef str _split_affixes( | ||||
|         self, | ||||
|         str string, | ||||
|         vector[const LexemeC*] *prefixes, | ||||
|         vector[const LexemeC*] *suffixes, | ||||
|         int* has_special, | ||||
|         bint with_special_cases | ||||
|     ): | ||||
|         cdef str prefix | ||||
|         cdef str suffix | ||||
|         cdef str minus_pre | ||||
|  | @ -444,10 +446,6 @@ cdef class Tokenizer: | |||
|                             vector[const LexemeC*] *suffixes, | ||||
|                             int* has_special, | ||||
|                             bint with_special_cases) except -1: | ||||
|         cdef bint specials_hit = 0 | ||||
|         cdef bint cache_hit = 0 | ||||
|         cdef int split, end | ||||
|         cdef const LexemeC* const* lexemes | ||||
|         cdef const LexemeC* lexeme | ||||
|         cdef str span | ||||
|         cdef int i | ||||
|  | @ -457,9 +455,11 @@ cdef class Tokenizer: | |||
|         if string: | ||||
|             if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases): | ||||
|                 pass | ||||
|             elif (self.token_match and self.token_match(string)) or \ | ||||
|                     (self.url_match and \ | ||||
|                     self.url_match(string)): | ||||
|             elif ( | ||||
|                 (self.token_match and self.token_match(string)) or | ||||
|                 (self.url_match and self.url_match(string)) | ||||
|             ): | ||||
| 
 | ||||
|                 # We're always saying 'no' to spaces here -- the caller will | ||||
|                 # fix up the outermost one, with reference to the original. | ||||
|                 # See Issue #859 | ||||
|  | @ -820,7 +820,7 @@ cdef class Tokenizer: | |||
|         self.infix_finditer = None | ||||
|         self.token_match = None | ||||
|         self.url_match = None | ||||
|         msg = util.from_bytes(bytes_data, deserializers, exclude) | ||||
|         util.from_bytes(bytes_data, deserializers, exclude) | ||||
|         if "prefix_search" in data and isinstance(data["prefix_search"], str): | ||||
|             self.prefix_search = re.compile(data["prefix_search"]).search | ||||
|         if "suffix_search" in data and isinstance(data["suffix_search"], str): | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except | |||
| cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 | ||||
| 
 | ||||
| 
 | ||||
| cdef int [:,:] _get_lca_matrix(Doc, int start, int end) | ||||
| cdef int [:, :] _get_lca_matrix(Doc, int start, int end) | ||||
| 
 | ||||
| 
 | ||||
| cdef class Doc: | ||||
|  | @ -61,7 +61,6 @@ cdef class Doc: | |||
|     cdef int length | ||||
|     cdef int max_length | ||||
| 
 | ||||
| 
 | ||||
|     cdef public object noun_chunks_iterator | ||||
| 
 | ||||
|     cdef object __weakref__ | ||||
|  |  | |||
|  | @ -35,6 +35,7 @@ from ..attrs cimport ( | |||
|     LENGTH, | ||||
|     MORPH, | ||||
|     NORM, | ||||
|     ORTH, | ||||
|     POS, | ||||
|     SENT_START, | ||||
|     SPACY, | ||||
|  | @ -42,14 +43,13 @@ from ..attrs cimport ( | |||
|     attr_id_t, | ||||
| ) | ||||
| from ..lexeme cimport EMPTY_LEXEME, Lexeme | ||||
| from ..typedefs cimport attr_t, flags_t | ||||
| from ..typedefs cimport attr_t | ||||
| from .token cimport Token | ||||
| 
 | ||||
| from .. import parts_of_speech, schemas, util | ||||
| from ..attrs import IDS, intify_attr | ||||
| from ..compat import copy_reg, pickle | ||||
| from ..compat import copy_reg | ||||
| from ..errors import Errors, Warnings | ||||
| from ..morphology import Morphology | ||||
| from ..util import get_words_and_spaces | ||||
| from .retokenizer import Retokenizer | ||||
| from .underscore import Underscore, get_ext_args | ||||
|  | @ -613,13 +613,26 @@ cdef class Doc: | |||
|         """ | ||||
|         if "similarity" in self.user_hooks: | ||||
|             return self.user_hooks["similarity"](self, other) | ||||
|         if isinstance(other, (Lexeme, Token)) and self.length == 1: | ||||
|             if self.c[0].lex.orth == other.orth: | ||||
|         attr = getattr(self.vocab.vectors, "attr", ORTH) | ||||
|         cdef Token this_token | ||||
|         cdef Token other_token | ||||
|         cdef Lexeme other_lex | ||||
|         if len(self) == 1 and isinstance(other, Token): | ||||
|             this_token = self[0] | ||||
|             other_token = other | ||||
|             if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr): | ||||
|                 return 1.0 | ||||
|         elif isinstance(other, (Span, Doc)) and len(self) == len(other): | ||||
|         elif len(self) == 1 and isinstance(other, Lexeme): | ||||
|             this_token = self[0] | ||||
|             other_lex = other | ||||
|             if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr): | ||||
|                 return 1.0 | ||||
|         elif isinstance(other, (Doc, Span)) and len(self) == len(other): | ||||
|             similar = True | ||||
|             for i in range(self.length): | ||||
|                 if self[i].orth != other[i].orth: | ||||
|             for i in range(len(self)): | ||||
|                 this_token = self[i] | ||||
|                 other_token = other[i] | ||||
|                 if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr): | ||||
|                     similar = False | ||||
|                     break | ||||
|             if similar: | ||||
|  | @ -767,7 +780,7 @@ cdef class Doc: | |||
|             # TODO: | ||||
|             # 1. Test basic data-driven ORTH gazetteer | ||||
|             # 2. Test more nuanced date and currency regex | ||||
|             cdef attr_t entity_type, kb_id, ent_id | ||||
|             cdef attr_t kb_id, ent_id | ||||
|             cdef int ent_start, ent_end | ||||
|             ent_spans = [] | ||||
|             for ent_info in ents: | ||||
|  | @ -975,7 +988,6 @@ cdef class Doc: | |||
|             >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) | ||||
|         """ | ||||
|         cdef int i, j | ||||
|         cdef attr_id_t feature | ||||
|         cdef np.ndarray[attr_t, ndim=2] output | ||||
|         # Handle scalar/list inputs of strings/ints for py_attr_ids | ||||
|         # See also #3064 | ||||
|  | @ -987,8 +999,10 @@ cdef class Doc: | |||
|             py_attr_ids = [py_attr_ids] | ||||
|         # Allow strings, e.g. 'lemma' or 'LEMMA' | ||||
|         try: | ||||
|             py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) | ||||
|                        for id_ in py_attr_ids] | ||||
|             py_attr_ids = [ | ||||
|                 (IDS[id_.upper()] if hasattr(id_, "upper") else id_) | ||||
|                 for id_ in py_attr_ids | ||||
|             ] | ||||
|         except KeyError as msg: | ||||
|             keys = list(IDS.keys()) | ||||
|             raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None | ||||
|  | @ -1022,8 +1036,6 @@ cdef class Doc: | |||
|         DOCS: https://spacy.io/api/doc#count_by | ||||
|         """ | ||||
|         cdef int i | ||||
|         cdef attr_t attr | ||||
|         cdef size_t count | ||||
| 
 | ||||
|         if counts is None: | ||||
|             counts = Counter() | ||||
|  | @ -1085,7 +1097,6 @@ cdef class Doc: | |||
|         cdef int i, col | ||||
|         cdef int32_t abs_head_index | ||||
|         cdef attr_id_t attr_id | ||||
|         cdef TokenC* tokens = self.c | ||||
|         cdef int length = len(array) | ||||
|         if length != len(self): | ||||
|             raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self))) | ||||
|  | @ -1226,7 +1237,7 @@ cdef class Doc: | |||
|                             span.label, | ||||
|                             span.kb_id, | ||||
|                             span.id, | ||||
|                             span.text, # included as a check | ||||
|                             span.text,  # included as a check | ||||
|                         )) | ||||
|             char_offset += len(doc.text) | ||||
|             if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_): | ||||
|  | @ -1505,7 +1516,6 @@ cdef class Doc: | |||
|             attributes are inherited from the syntactic root of the span. | ||||
|         RETURNS (Token): The first newly merged token. | ||||
|         """ | ||||
|         cdef str tag, lemma, ent_type | ||||
|         attr_len = len(attributes) | ||||
|         span_len = len(spans) | ||||
|         if not attr_len == span_len: | ||||
|  | @ -1621,7 +1631,6 @@ cdef class Doc: | |||
|                 for token in char_span[1:]: | ||||
|                     token.is_sent_start = False | ||||
| 
 | ||||
| 
 | ||||
|         for span_group in doc_json.get("spans", {}): | ||||
|             spans = [] | ||||
|             for span in doc_json["spans"][span_group]: | ||||
|  | @ -1653,7 +1662,7 @@ cdef class Doc: | |||
|                 start = token_by_char(self.c, self.length, token_data["start"]) | ||||
|                 value = token_data["value"] | ||||
|                 self[start]._.set(token_attr, value) | ||||
|                  | ||||
| 
 | ||||
|         for span_attr in doc_json.get("underscore_span", {}): | ||||
|             if not Span.has_extension(span_attr): | ||||
|                 Span.set_extension(span_attr) | ||||
|  | @ -1699,7 +1708,7 @@ cdef class Doc: | |||
|                 token_data["dep"] = token.dep_ | ||||
|                 token_data["head"] = token.head.i | ||||
|             data["tokens"].append(token_data) | ||||
|          | ||||
| 
 | ||||
|         if self.spans: | ||||
|             data["spans"] = {} | ||||
|             for span_group in self.spans: | ||||
|  | @ -1750,7 +1759,7 @@ cdef class Doc: | |||
|                                     data["underscore_span"] = {} | ||||
|                                 if attr not in data["underscore_span"]: | ||||
|                                     data["underscore_span"][attr] = [] | ||||
|                                 data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id}) | ||||
|                                 data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id": _span_id}) | ||||
| 
 | ||||
|             for attr in underscore: | ||||
|                 if attr not in user_keys: | ||||
|  | @ -1773,7 +1782,6 @@ cdef class Doc: | |||
|         output.fill(255) | ||||
|         cdef int i, j, start_idx, end_idx | ||||
|         cdef bytes byte_string | ||||
|         cdef unsigned char utf8_char | ||||
|         for i, byte_string in enumerate(byte_strings): | ||||
|             j = 0 | ||||
|             start_idx = 0 | ||||
|  | @ -1826,8 +1834,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2 | |||
| 
 | ||||
| cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: | ||||
|     # note: end is exclusive | ||||
|     cdef TokenC* head | ||||
|     cdef TokenC* child | ||||
|     cdef int i | ||||
|     # Set number of left/right children to 0. We'll increment it in the loops. | ||||
|     for i in range(start, end): | ||||
|  | @ -1927,7 +1933,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k): | |||
|     return -1 | ||||
| 
 | ||||
| 
 | ||||
| cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): | ||||
| cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end): | ||||
|     """Given a doc and a start and end position defining a set of contiguous | ||||
|     tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where | ||||
|     LCA[i, j] is the index of the lowest common ancestor among token i and j. | ||||
|  | @ -1940,7 +1946,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): | |||
|     RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32], | ||||
|         with shape (n, n), where n = len(doc). | ||||
|     """ | ||||
|     cdef int [:,:] lca_matrix | ||||
|     cdef int [:, :] lca_matrix | ||||
|     cdef int j, k | ||||
|     n_tokens= end - start | ||||
|     lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32) | ||||
|  |  | |||
|  | @ -3,7 +3,7 @@ from typing import Generator, List, Tuple | |||
| 
 | ||||
| cimport cython | ||||
| from cython.operator cimport dereference | ||||
| from libc.stdint cimport int32_t, int64_t | ||||
| from libc.stdint cimport int32_t | ||||
| from libcpp.pair cimport pair | ||||
| from libcpp.unordered_map cimport unordered_map | ||||
| from libcpp.unordered_set cimport unordered_set | ||||
|  | @ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set | |||
| import weakref | ||||
| 
 | ||||
| from murmurhash.mrmr cimport hash64 | ||||
| from preshed.maps cimport map_get_unless_missing | ||||
| 
 | ||||
| from .. import Errors | ||||
| 
 | ||||
|  | @ -26,7 +25,7 @@ from .token import Token | |||
| cdef class Edge: | ||||
|     cdef readonly Graph graph | ||||
|     cdef readonly int i | ||||
|      | ||||
| 
 | ||||
|     def __init__(self, Graph graph, int i): | ||||
|         self.graph = graph | ||||
|         self.i = i | ||||
|  | @ -42,7 +41,7 @@ cdef class Edge: | |||
|     @property | ||||
|     def head(self) -> "Node": | ||||
|         return Node(self.graph, self.graph.c.edges[self.i].head) | ||||
|      | ||||
| 
 | ||||
|     @property | ||||
|     def tail(self) -> "Tail": | ||||
|         return Node(self.graph, self.graph.c.edges[self.i].tail) | ||||
|  | @ -68,7 +67,7 @@ cdef class Node: | |||
|     def __init__(self, Graph graph, int i): | ||||
|         """A reference to a node of an annotation graph. Each node is made up of | ||||
|         an ordered set of zero or more token indices. | ||||
|          | ||||
| 
 | ||||
|         Node references are usually created by the Graph object itself, or from | ||||
|         the Node or Edge objects. You usually won't need to instantiate this | ||||
|         class yourself. | ||||
|  | @ -107,13 +106,13 @@ cdef class Node: | |||
|     @property | ||||
|     def is_none(self) -> bool: | ||||
|         """Whether the node is a special value, indicating 'none'. | ||||
|          | ||||
| 
 | ||||
|         The NoneNode type is returned by the Graph, Edge and Node objects when | ||||
|         there is no match to a query. It has the same API as Node, but it always | ||||
|         returns NoneNode, NoneEdge or empty lists for its queries. | ||||
|         """ | ||||
|         return False | ||||
|   | ||||
| 
 | ||||
|     @property | ||||
|     def doc(self) -> "Doc": | ||||
|         """The Doc object that the graph refers to.""" | ||||
|  | @ -128,19 +127,19 @@ cdef class Node: | |||
|     def head(self, i=None, label=None) -> "Node": | ||||
|         """Get the head of the first matching edge, searching by index, label, | ||||
|         both or neither. | ||||
|          | ||||
| 
 | ||||
|         For instance, `node.head(i=1)` will get the head of the second edge that | ||||
|         this node is a tail of. `node.head(i=1, label="ARG0")` will further | ||||
|         check that the second edge has the label `"ARG0"`.  | ||||
|          | ||||
| 
 | ||||
|         If no matching node can be found, the graph's NoneNode is returned.  | ||||
|         """ | ||||
|         return self.headed(i=i, label=label) | ||||
|      | ||||
| 
 | ||||
|     def tail(self, i=None, label=None) -> "Node": | ||||
|         """Get the tail of the first matching edge, searching by index, label, | ||||
|         both or neither. | ||||
|   | ||||
| 
 | ||||
|         If no matching node can be found, the graph's NoneNode is returned.  | ||||
|         """ | ||||
|         return self.tailed(i=i, label=label).tail | ||||
|  | @ -169,7 +168,7 @@ cdef class Node: | |||
|         cdef vector[int] edge_indices | ||||
|         self._find_edges(edge_indices, "head", label) | ||||
|         return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices] | ||||
|       | ||||
| 
 | ||||
|     def tails(self, label=None) -> List["Node"]: | ||||
|         """Find all matching tails of this node.""" | ||||
|         cdef vector[int] edge_indices | ||||
|  | @ -198,7 +197,7 @@ cdef class Node: | |||
|             return NoneEdge(self.graph) | ||||
|         else: | ||||
|             return Edge(self.graph, idx) | ||||
|      | ||||
| 
 | ||||
|     def tailed(self, i=None, label=None) -> Edge: | ||||
|         """Find the first matching edge tailed by this node. | ||||
|         If no matching edge can be found, the graph's NoneEdge is returned. | ||||
|  | @ -281,7 +280,7 @@ cdef class NoneEdge(Edge): | |||
|     def __init__(self, graph): | ||||
|         self.graph = graph | ||||
|         self.i = -1 | ||||
|     | ||||
| 
 | ||||
|     @property | ||||
|     def doc(self) -> "Doc": | ||||
|         return self.graph.doc | ||||
|  | @ -289,7 +288,7 @@ cdef class NoneEdge(Edge): | |||
|     @property | ||||
|     def head(self) -> "NoneNode": | ||||
|         return NoneNode(self.graph) | ||||
|      | ||||
| 
 | ||||
|     @property | ||||
|     def tail(self) -> "NoneNode": | ||||
|         return NoneNode(self.graph) | ||||
|  | @ -317,7 +316,7 @@ cdef class NoneNode(Node): | |||
| 
 | ||||
|     def __len__(self): | ||||
|         return 0 | ||||
|   | ||||
| 
 | ||||
|     @property | ||||
|     def is_none(self): | ||||
|         return -1 | ||||
|  | @ -338,14 +337,14 @@ cdef class NoneNode(Node): | |||
| 
 | ||||
|     def walk_heads(self): | ||||
|         yield from []  | ||||
|      | ||||
| 
 | ||||
|     def walk_tails(self): | ||||
|         yield from []  | ||||
|   | ||||
| 
 | ||||
| 
 | ||||
| cdef class Graph: | ||||
|     """A set of directed labelled relationships between sets of tokens. | ||||
|      | ||||
| 
 | ||||
|     EXAMPLE: | ||||
|         Construction 1 | ||||
|         >>> graph = Graph(doc, name="srl") | ||||
|  | @ -370,7 +369,9 @@ cdef class Graph: | |||
|         >>> assert graph.has_node((0,)) | ||||
|         >>> assert graph.has_edge((0,), (1,3), label="agent") | ||||
|     """ | ||||
|     def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None): | ||||
|     def __init__( | ||||
|         self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None  # no-cython-lint | ||||
|     ): | ||||
|         """Create a Graph object. | ||||
| 
 | ||||
|         doc (Doc): The Doc object the graph will refer to. | ||||
|  | @ -436,13 +437,11 @@ cdef class Graph: | |||
| 
 | ||||
|     def add_edge(self, head, tail, *, label="", weight=None) -> Edge: | ||||
|         """Add an edge to the graph, connecting two groups of tokens. | ||||
|         | ||||
| 
 | ||||
|         If there is already an edge for the (head, tail, label) triple, it will | ||||
|         be returned, and no new edge will be created. The weight of the edge | ||||
|         will be updated if a weight is specified. | ||||
|         """ | ||||
|         label_hash = self.doc.vocab.strings.as_int(label) | ||||
|         weight_float = weight if weight is not None else 0.0 | ||||
|         edge_index = add_edge( | ||||
|             &self.c, | ||||
|             EdgeC( | ||||
|  | @ -476,11 +475,11 @@ cdef class Graph: | |||
|     def has_edge(self, head, tail, label) -> bool: | ||||
|         """Check whether a (head, tail, label) triple is an edge in the graph.""" | ||||
|         return not self.get_edge(head, tail, label=label).is_none | ||||
|      | ||||
| 
 | ||||
|     def add_node(self, indices) -> Node: | ||||
|         """Add a node to the graph and return it. Nodes refer to ordered sets | ||||
|         of token indices. | ||||
|          | ||||
| 
 | ||||
|         This method is idempotent: if there is already a node for the given | ||||
|         indices, it is returned without a new node being created. | ||||
|         """ | ||||
|  | @ -508,7 +507,7 @@ cdef class Graph: | |||
|             return NoneNode(self) | ||||
|         else: | ||||
|             return Node(self, node_index) | ||||
|   | ||||
| 
 | ||||
|     def has_node(self, tuple indices) -> bool: | ||||
|         """Check whether the graph has a node for the given indices.""" | ||||
|         return not self.get_node(indices).is_none | ||||
|  | @ -568,7 +567,7 @@ cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil: | |||
|         graph.roots.insert(index) | ||||
|         graph.node_map.insert(pair[hash_t, int](key, index)) | ||||
|         return index | ||||
|   | ||||
| 
 | ||||
| 
 | ||||
| cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil: | ||||
|     key = hash64(&node[0], node.size() * sizeof(node[0]), 0) | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| cimport numpy as np | ||||
| from libc.string cimport memset | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| from ..morphology import Morphology | ||||
|  | @ -94,4 +93,3 @@ cdef class MorphAnalysis: | |||
| 
 | ||||
|     def __repr__(self): | ||||
|         return self.to_json() | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,7 +1,6 @@ | |||
| # cython: infer_types=True, bounds_check=False, profile=True | ||||
| from cymem.cymem cimport Pool | ||||
| from libc.stdlib cimport free, malloc | ||||
| from libc.string cimport memcpy, memset | ||||
| from libc.string cimport memset | ||||
| 
 | ||||
| import numpy | ||||
| from thinc.api import get_array_module | ||||
|  | @ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM | |||
| from ..lexeme cimport EMPTY_LEXEME, Lexeme | ||||
| from ..structs cimport LexemeC, TokenC | ||||
| from ..vocab cimport Vocab | ||||
| from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start | ||||
| from .doc cimport Doc, set_children_from_heads, token_by_start | ||||
| from .span cimport Span | ||||
| from .token cimport Token | ||||
| 
 | ||||
|  | @ -148,7 +147,7 @@ def _merge(Doc doc, merges): | |||
|         syntactic root of the span. | ||||
|     RETURNS (Token): The first newly merged token. | ||||
|     """ | ||||
|     cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index | ||||
|     cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index | ||||
|     cdef Span span | ||||
|     cdef const LexemeC* lex | ||||
|     cdef TokenC* token | ||||
|  | @ -166,7 +165,6 @@ def _merge(Doc doc, merges): | |||
|     merges.sort(key=_get_start) | ||||
|     for merge_index, (span, attributes) in enumerate(merges): | ||||
|         start = span.start | ||||
|         end = span.end | ||||
|         spans.append(span) | ||||
|         # House the new merged token where it starts | ||||
|         token = &doc.c[start] | ||||
|  | @ -204,8 +202,9 @@ def _merge(Doc doc, merges): | |||
|     # for the merged region. To do this, we create a boolean array indicating | ||||
|     # whether the row is to be deleted, then use numpy.delete | ||||
|     if doc.tensor is not None and doc.tensor.size != 0: | ||||
|         doc.tensor = _resize_tensor(doc.tensor, | ||||
|             [(m[0].start, m[0].end) for m in merges]) | ||||
|         doc.tensor = _resize_tensor( | ||||
|             doc.tensor, [(m[0].start, m[0].end) for m in merges] | ||||
|         ) | ||||
|     # Memorize span roots and sets dependencies of the newly merged | ||||
|     # tokens to the dependencies of their roots. | ||||
|     span_roots = [] | ||||
|  | @ -268,11 +267,11 @@ def _merge(Doc doc, merges): | |||
|             span_index += 1 | ||||
|         if span_index < len(spans) and i == spans[span_index].start: | ||||
|             # First token in a span | ||||
|             doc.c[i - offset] = doc.c[i] # move token to its place | ||||
|             doc.c[i - offset] = doc.c[i]  # move token to its place | ||||
|             offset += (spans[span_index].end - spans[span_index].start) - 1 | ||||
|             in_span = True | ||||
|         if not in_span: | ||||
|             doc.c[i - offset] = doc.c[i] # move token to its place | ||||
|             doc.c[i - offset] = doc.c[i]  # move token to its place | ||||
| 
 | ||||
|     for i in range(doc.length - offset, doc.length): | ||||
|         memset(&doc.c[i], 0, sizeof(TokenC)) | ||||
|  | @ -346,7 +345,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs): | |||
|     if to_process_tensor: | ||||
|         xp = get_array_module(doc.tensor) | ||||
|         if xp is numpy: | ||||
|             doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0) | ||||
|             doc.tensor = xp.append( | ||||
|                 doc.tensor, | ||||
|                 xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"), | ||||
|                 axis=0 | ||||
|             ) | ||||
|         else: | ||||
|             shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1]) | ||||
|             resized_array = xp.zeros(shape, dtype="float32") | ||||
|  | @ -368,7 +371,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs): | |||
|         token.norm = 0  # reset norm | ||||
|         if to_process_tensor: | ||||
|             # setting the tensors of the split tokens to array of zeros | ||||
|             doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32") | ||||
|             doc.tensor[token_index + i:token_index + i + 1] = \ | ||||
|                 xp.zeros((1, doc.tensor.shape[1]), dtype="float32") | ||||
|         # Update the character offset of the subtokens | ||||
|         if i != 0: | ||||
|             token.idx = orig_token.idx + idx_offset | ||||
|  | @ -456,7 +460,6 @@ def normalize_token_attrs(Vocab vocab, attrs): | |||
| def set_token_attrs(Token py_token, attrs): | ||||
|     cdef TokenC* token = py_token.c | ||||
|     cdef const LexemeC* lex = token.lex | ||||
|     cdef Doc doc = py_token.doc | ||||
|     # Assign attributes | ||||
|     for attr_name, attr_value in attrs.items(): | ||||
|         if attr_name == "_":  # Set extension attributes | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| cimport numpy as np | ||||
| from libc.math cimport sqrt | ||||
| from libcpp.memory cimport make_shared | ||||
| 
 | ||||
| import copy | ||||
|  | @ -9,13 +8,13 @@ import numpy | |||
| from thinc.api import get_array_module | ||||
| 
 | ||||
| from ..attrs cimport * | ||||
| from ..attrs cimport attr_id_t | ||||
| from ..attrs cimport ORTH, attr_id_t | ||||
| from ..lexeme cimport Lexeme | ||||
| from ..parts_of_speech cimport univ_pos_t | ||||
| from ..structs cimport LexemeC, TokenC | ||||
| from ..structs cimport TokenC | ||||
| from ..symbols cimport dep | ||||
| from ..typedefs cimport attr_t, flags_t, hash_t | ||||
| from ..typedefs cimport attr_t | ||||
| from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start | ||||
| from .token cimport Token | ||||
| 
 | ||||
| from ..errors import Errors, Warnings | ||||
| from ..util import normalize_slice | ||||
|  | @ -226,8 +225,8 @@ cdef class Span: | |||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|         cdef SpanC* span_c = self.span_c() | ||||
|         """Custom extension attributes registered via `set_extension`.""" | ||||
|         cdef SpanC* span_c = self.span_c() | ||||
|         return Underscore(Underscore.span_extensions, self, | ||||
|                           start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) | ||||
| 
 | ||||
|  | @ -371,13 +370,26 @@ cdef class Span: | |||
|         """ | ||||
|         if "similarity" in self.doc.user_span_hooks: | ||||
|             return self.doc.user_span_hooks["similarity"](self, other) | ||||
|         if len(self) == 1 and hasattr(other, "orth"): | ||||
|             if self[0].orth == other.orth: | ||||
|         attr = getattr(self.doc.vocab.vectors, "attr", ORTH) | ||||
|         cdef Token this_token | ||||
|         cdef Token other_token | ||||
|         cdef Lexeme other_lex | ||||
|         if len(self) == 1 and isinstance(other, Token): | ||||
|             this_token = self[0] | ||||
|             other_token = other | ||||
|             if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr): | ||||
|                 return 1.0 | ||||
|         elif len(self) == 1 and isinstance(other, Lexeme): | ||||
|             this_token = self[0] | ||||
|             other_lex = other | ||||
|             if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr): | ||||
|                 return 1.0 | ||||
|         elif isinstance(other, (Doc, Span)) and len(self) == len(other): | ||||
|             similar = True | ||||
|             for i in range(len(self)): | ||||
|                 if self[i].orth != getattr(other[i], "orth", None): | ||||
|                 this_token = self[i] | ||||
|                 other_token = other[i] | ||||
|                 if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr): | ||||
|                     similar = False | ||||
|                     break | ||||
|             if similar: | ||||
|  | @ -607,7 +619,6 @@ cdef class Span: | |||
|         """ | ||||
|         return "".join([t.text_with_ws for t in self]) | ||||
| 
 | ||||
| 
 | ||||
|     @property | ||||
|     def noun_chunks(self): | ||||
|         """Iterate over the base noun phrases in the span. Yields base | ||||
|  | @ -922,7 +933,6 @@ cdef class Span: | |||
|             self.id_ = ent_id_ | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: | ||||
|     # Don't allow spaces to be the root, if there are | ||||
|     # better candidates | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| import struct | ||||
| import weakref | ||||
| from copy import deepcopy | ||||
| from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union | ||||
| from typing import Iterable, Optional, Union | ||||
| 
 | ||||
| import srsly | ||||
| 
 | ||||
|  | @ -36,7 +36,7 @@ cdef class SpanGroup: | |||
| 
 | ||||
|     DOCS: https://spacy.io/api/spangroup | ||||
|     """ | ||||
|     def __init__(self, doc, *, name="", attrs={}, spans=[]): | ||||
|     def __init__(self, doc, *, name="", attrs={}, spans=[]):  # no-cython-lint | ||||
|         """Create a SpanGroup. | ||||
| 
 | ||||
|         doc (Doc): The reference Doc object. | ||||
|  | @ -315,7 +315,7 @@ cdef class SpanGroup: | |||
| 
 | ||||
|             other_attrs = deepcopy(other_group.attrs) | ||||
|             span_group.attrs.update({ | ||||
|                 key: value for key, value in other_attrs.items() \ | ||||
|                 key: value for key, value in other_attrs.items() | ||||
|                 if key not in span_group.attrs | ||||
|             }) | ||||
|             if len(other_group): | ||||
|  |  | |||
|  | @ -26,7 +26,7 @@ cdef class Token: | |||
|         cdef Token self = Token.__new__(Token, vocab, doc, offset) | ||||
|         return self | ||||
| 
 | ||||
|     #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs): | ||||
|     # cdef inline TokenC struct_from_attrs(Vocab vocab, attrs): | ||||
|     #    cdef TokenC token | ||||
|     #    attrs = normalize_attrs(attrs) | ||||
| 
 | ||||
|  | @ -98,12 +98,10 @@ cdef class Token: | |||
|         elif feat_name == SENT_START: | ||||
|             token.sent_start = value | ||||
| 
 | ||||
| 
 | ||||
|     @staticmethod | ||||
|     cdef inline int missing_dep(const TokenC* token) nogil: | ||||
|         return token.dep == MISSING_DEP | ||||
| 
 | ||||
| 
 | ||||
|     @staticmethod | ||||
|     cdef inline int missing_head(const TokenC* token) nogil: | ||||
|         return Token.missing_dep(token) | ||||
|  |  | |||
|  | @ -1,13 +1,11 @@ | |||
| # cython: infer_types=True | ||||
| # Compiler crashes on memory view coercion without this. Should report bug. | ||||
| cimport numpy as np | ||||
| from cython.view cimport array as cvarray | ||||
| 
 | ||||
| np.import_array() | ||||
| 
 | ||||
| import warnings | ||||
| 
 | ||||
| import numpy | ||||
| from thinc.api import get_array_module | ||||
| 
 | ||||
| from ..attrs cimport ( | ||||
|  | @ -216,11 +214,17 @@ cdef class Token: | |||
|         """ | ||||
|         if "similarity" in self.doc.user_token_hooks: | ||||
|             return self.doc.user_token_hooks["similarity"](self, other) | ||||
|         if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"): | ||||
|             if self.c.lex.orth == getattr(other[0], "orth", None): | ||||
|         attr = getattr(self.doc.vocab.vectors, "attr", ORTH) | ||||
|         cdef Token this_token = self | ||||
|         cdef Token other_token | ||||
|         cdef Lexeme other_lex | ||||
|         if isinstance(other, Token): | ||||
|             other_token = other | ||||
|             if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr): | ||||
|                 return 1.0 | ||||
|         elif hasattr(other, "orth"): | ||||
|             if self.c.lex.orth == other.orth: | ||||
|         elif isinstance(other, Lexeme): | ||||
|             other_lex = other | ||||
|             if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr): | ||||
|                 return 1.0 | ||||
|         if self.vocab.vectors.n_keys == 0: | ||||
|             warnings.warn(Warnings.W007.format(obj="Token")) | ||||
|  | @ -233,7 +237,7 @@ cdef class Token: | |||
|         result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) | ||||
|         # ensure we get a scalar back (numpy does this automatically but cupy doesn't) | ||||
|         return result.item() | ||||
|      | ||||
| 
 | ||||
|     def has_morph(self): | ||||
|         """Check whether the token has annotated morph information. | ||||
|         Return False when the morph annotation is unset/missing. | ||||
|  | @ -528,9 +532,9 @@ cdef class Token: | |||
|         def __get__(self): | ||||
|             if self.i + 1 == len(self.doc): | ||||
|                 return True | ||||
|             elif self.doc[self.i+1].is_sent_start == None: | ||||
|             elif self.doc[self.i+1].is_sent_start is None: | ||||
|                 return None | ||||
|             elif self.doc[self.i+1].is_sent_start == True: | ||||
|             elif self.doc[self.i+1].is_sent_start is True: | ||||
|                 return True | ||||
|             else: | ||||
|                 return False | ||||
|  |  | |||
|  | @ -37,10 +37,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li | |||
|             b2a.append(set()) | ||||
|         # Process the alignment at the current position | ||||
|         if A[token_idx_a] == B[token_idx_b] and \ | ||||
|                 (char_idx_a == 0 or \ | ||||
|                     char_to_token_a[char_idx_a - 1] < token_idx_a) and \ | ||||
|                 (char_idx_b == 0 or \ | ||||
|                     char_to_token_b[char_idx_b - 1] < token_idx_b): | ||||
|                 ( | ||||
|                     char_idx_a == 0 or | ||||
|                     char_to_token_a[char_idx_a - 1] < token_idx_a | ||||
|                 ) and \ | ||||
|                 ( | ||||
|                     char_idx_b == 0 or | ||||
|                     char_to_token_b[char_idx_b - 1] < token_idx_b | ||||
|                 ): | ||||
|             # Current tokens are identical and both character offsets are the | ||||
|             # start of a token (either at the beginning of the document or the | ||||
|             # previous character belongs to a different token) | ||||
|  |  | |||
|  | @ -1,4 +1,3 @@ | |||
| import warnings | ||||
| from collections.abc import Iterable as IterableInstance | ||||
| 
 | ||||
| import numpy | ||||
|  | @ -31,9 +30,9 @@ cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): | |||
|     attrs, array = _annot2array(vocab, tok_annot, doc_annot) | ||||
|     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) | ||||
|     if "entities" in doc_annot: | ||||
|        _add_entities_to_doc(output, doc_annot["entities"]) | ||||
|         _add_entities_to_doc(output, doc_annot["entities"]) | ||||
|     if "spans" in doc_annot: | ||||
|        _add_spans_to_doc(output, doc_annot["spans"]) | ||||
|         _add_spans_to_doc(output, doc_annot["spans"]) | ||||
|     if array.size: | ||||
|         output = output.from_array(attrs, array) | ||||
|     # links are currently added with ENT_KB_ID on the token level | ||||
|  | @ -168,7 +167,6 @@ cdef class Example: | |||
|                 self._y_sig = y_sig | ||||
|                 return self._cached_alignment | ||||
| 
 | ||||
| 
 | ||||
|     def _get_aligned_vectorized(self, align, gold_values): | ||||
|         # Fast path for Doc attributes/fields that are predominantly a single value, | ||||
|         # i.e., TAG, POS, MORPH. | ||||
|  | @ -211,7 +209,6 @@ cdef class Example: | |||
| 
 | ||||
|         return output.tolist() | ||||
| 
 | ||||
| 
 | ||||
|     def _get_aligned_non_vectorized(self, align, gold_values): | ||||
|         # Slower path for fields that return multiple values (resulting | ||||
|         # in ragged arrays that cannot be vectorized trivially). | ||||
|  | @ -228,7 +225,6 @@ cdef class Example: | |||
| 
 | ||||
|         return output | ||||
| 
 | ||||
| 
 | ||||
|     def get_aligned(self, field, as_string=False): | ||||
|         """Return an aligned array for a token attribute.""" | ||||
|         align = self.alignment.x2y | ||||
|  | @ -337,7 +333,7 @@ cdef class Example: | |||
|             missing=None | ||||
|         ) | ||||
|         # Now fill the tokens we can align to O. | ||||
|         O = 2 # I=1, O=2, B=3 | ||||
|         O = 2 # I=1, O=2, B=3  # no-cython-lint: E741 | ||||
|         for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")): | ||||
|             if x_tags[i] is None: | ||||
|                 if ent_iob == O: | ||||
|  | @ -347,7 +343,7 @@ cdef class Example: | |||
|         return x_ents, x_tags | ||||
| 
 | ||||
|     def get_aligned_ner(self): | ||||
|         x_ents, x_tags = self.get_aligned_ents_and_ner() | ||||
|         _x_ents, x_tags = self.get_aligned_ents_and_ner() | ||||
|         return x_tags | ||||
| 
 | ||||
|     def get_matching_ents(self, check_label=True): | ||||
|  | @ -405,7 +401,6 @@ cdef class Example: | |||
| 
 | ||||
|         return span_dict | ||||
| 
 | ||||
| 
 | ||||
|     def _links_to_dict(self): | ||||
|         links = {} | ||||
|         for ent in self.reference.ents: | ||||
|  | @ -596,6 +591,7 @@ def _fix_legacy_dict_data(example_dict): | |||
|         "doc_annotation": doc_dict | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| def _has_field(annot, field): | ||||
|     if field not in annot: | ||||
|         return False | ||||
|  | @ -632,6 +628,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): | |||
|                 ent_types.append("") | ||||
|     return ent_iobs, ent_types | ||||
| 
 | ||||
| 
 | ||||
| def _parse_links(vocab, words, spaces, links): | ||||
|     reference = Doc(vocab, words=words, spaces=spaces) | ||||
|     starts = {token.idx: token.i for token in reference} | ||||
|  |  | |||
|  | @ -1,4 +1,3 @@ | |||
| import json | ||||
| import warnings | ||||
| 
 | ||||
| import srsly | ||||
|  | @ -6,7 +5,7 @@ import srsly | |||
| from .. import util | ||||
| from ..errors import Warnings | ||||
| from ..tokens import Doc | ||||
| from .iob_utils import offsets_to_biluo_tags, tags_to_entities | ||||
| from .iob_utils import offsets_to_biluo_tags | ||||
| 
 | ||||
| 
 | ||||
| def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): | ||||
|  | @ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): | |||
|     json_doc = {"id": doc_id, "paragraphs": []} | ||||
|     for i, doc in enumerate(docs): | ||||
|         raw = None if doc.has_unknown_spaces else doc.text | ||||
|         json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []} | ||||
|         json_para = { | ||||
|             'raw': raw, | ||||
|             "sentences": [], | ||||
|             "cats": [], | ||||
|             "entities": [], | ||||
|             "links": [] | ||||
|         } | ||||
|         for cat, val in doc.cats.items(): | ||||
|             json_cat = {"label": cat, "value": val} | ||||
|             json_para["cats"].append(json_cat) | ||||
|  | @ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): | |||
|             if ent.kb_id_: | ||||
|                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} | ||||
|                 json_para["links"].append(link_dict) | ||||
|         biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag) | ||||
|         biluo_tags = offsets_to_biluo_tags( | ||||
|             doc, json_para["entities"], missing=ner_missing_tag | ||||
|         ) | ||||
|         attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB") | ||||
|         include_annotation = {attr: doc.has_annotation(attr) for attr in attrs} | ||||
|         for j, sent in enumerate(doc.sents): | ||||
|             json_sent = {"tokens": [], "brackets": []} | ||||
|             for token in sent: | ||||
|                 json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_} | ||||
|                 json_token = { | ||||
|                     "id": token.i, "orth": token.text, "space": token.whitespace_ | ||||
|                 } | ||||
|                 if include_annotation["TAG"]: | ||||
|                     json_token["tag"] = token.tag_ | ||||
|                 if include_annotation["POS"]: | ||||
|  | @ -125,9 +134,14 @@ def json_to_annotations(doc): | |||
|                 else: | ||||
|                     sent_starts.append(-1) | ||||
|             if "brackets" in sent: | ||||
|                 brackets.extend((b["first"] + sent_start_i, | ||||
|                                  b["last"] + sent_start_i, b["label"]) | ||||
|                                  for b in sent["brackets"]) | ||||
|                 brackets.extend( | ||||
|                     ( | ||||
|                         b["first"] + sent_start_i, | ||||
|                         b["last"] + sent_start_i, | ||||
|                         b["label"] | ||||
|                     ) | ||||
|                     for b in sent["brackets"] | ||||
|                 ) | ||||
| 
 | ||||
|         example["token_annotation"] = dict( | ||||
|             ids=ids, | ||||
|  | @ -160,6 +174,7 @@ def json_to_annotations(doc): | |||
|         ) | ||||
|         yield example | ||||
| 
 | ||||
| 
 | ||||
| def json_iterate(bytes utf8_str): | ||||
|     # We should've made these files jsonl...But since we didn't, parse out | ||||
|     # the docs one-by-one to reduce memory usage. | ||||
|  |  | |||
|  | @ -71,7 +71,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": | |||
|         with nlp.select_pipes(enable=resume_components): | ||||
|             logger.info("Resuming training for: %s", resume_components) | ||||
|             nlp.resume_training(sgd=optimizer) | ||||
|     # Make sure that listeners are defined before initializing further | ||||
|     # Make sure that internal component names are synced and listeners are | ||||
|     # defined before initializing further | ||||
|     nlp._link_components() | ||||
|     with nlp.select_pipes(disable=[*frozen_components, *resume_components]): | ||||
|         if T["max_epochs"] == -1: | ||||
|  | @ -305,9 +306,14 @@ def convert_vectors( | |||
|     truncate: int, | ||||
|     prune: int, | ||||
|     mode: str = VectorsMode.default, | ||||
|     attr: str = "ORTH", | ||||
| ) -> None: | ||||
|     vectors_loc = ensure_path(vectors_loc) | ||||
|     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): | ||||
|         if attr != "ORTH": | ||||
|             raise ValueError( | ||||
|                 "ORTH is the only attribute supported for vectors in .npz format." | ||||
|             ) | ||||
|         nlp.vocab.vectors = Vectors( | ||||
|             strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb")) | ||||
|         ) | ||||
|  | @ -335,11 +341,15 @@ def convert_vectors( | |||
|                 nlp.vocab.vectors = Vectors( | ||||
|                     strings=nlp.vocab.strings, | ||||
|                     data=vectors_data, | ||||
|                     attr=attr, | ||||
|                     **floret_settings, | ||||
|                 ) | ||||
|             else: | ||||
|                 nlp.vocab.vectors = Vectors( | ||||
|                     strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys | ||||
|                     strings=nlp.vocab.strings, | ||||
|                     data=vectors_data, | ||||
|                     keys=vector_keys, | ||||
|                     attr=attr, | ||||
|                 ) | ||||
|                 nlp.vocab.deduplicate_vectors() | ||||
|     if prune >= 1 and mode != VectorsMode.floret: | ||||
|  |  | |||
|  | @ -518,7 +518,7 @@ def load_model_from_path( | |||
|     if not meta: | ||||
|         meta = get_model_meta(model_path) | ||||
|     config_path = model_path / "config.cfg" | ||||
|     overrides = dict_to_dot(config) | ||||
|     overrides = dict_to_dot(config, for_overrides=True) | ||||
|     config = load_config(config_path, overrides=overrides) | ||||
|     nlp = load_model_from_config( | ||||
|         config, | ||||
|  | @ -1486,14 +1486,19 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]: | |||
|     return result | ||||
| 
 | ||||
| 
 | ||||
| def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]: | ||||
| def dict_to_dot(obj: Dict[str, dict], *, for_overrides: bool = False) -> Dict[str, Any]: | ||||
|     """Convert a dict to dot notation. For example: {"token": {"pos": True, | ||||
|     "_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}. | ||||
| 
 | ||||
|     values (Dict[str, dict]): The dict to convert. | ||||
|     obj (Dict[str, dict]): The dict to convert. | ||||
|     for_overrides (bool): Whether to enable special handling for registered | ||||
|         functions in overrides. | ||||
|     RETURNS (Dict[str, Any]): The key/value pairs. | ||||
|     """ | ||||
|     return {".".join(key): value for key, value in walk_dict(obj)} | ||||
|     return { | ||||
|         ".".join(key): value | ||||
|         for key, value in walk_dict(obj, for_overrides=for_overrides) | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| def dot_to_object(config: Config, section: str): | ||||
|  | @ -1535,13 +1540,20 @@ def set_dot_to_object(config: Config, section: str, value: Any) -> None: | |||
| 
 | ||||
| 
 | ||||
| def walk_dict( | ||||
|     node: Dict[str, Any], parent: List[str] = [] | ||||
|     node: Dict[str, Any], parent: List[str] = [], *, for_overrides: bool = False | ||||
| ) -> Iterator[Tuple[List[str], Any]]: | ||||
|     """Walk a dict and yield the path and values of the leaves.""" | ||||
|     """Walk a dict and yield the path and values of the leaves. | ||||
| 
 | ||||
|     for_overrides (bool): Whether to treat registered functions that start with | ||||
|         @ as final values rather than dicts to traverse. | ||||
|     """ | ||||
|     for key, value in node.items(): | ||||
|         key_parent = [*parent, key] | ||||
|         if isinstance(value, dict): | ||||
|             yield from walk_dict(value, key_parent) | ||||
|         if isinstance(value, dict) and ( | ||||
|             not for_overrides | ||||
|             or not any(value_key.startswith("@") for value_key in value) | ||||
|         ): | ||||
|             yield from walk_dict(value, key_parent, for_overrides=for_overrides) | ||||
|         else: | ||||
|             yield (key_parent, value) | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,10 +1,8 @@ | |||
| cimport numpy as np | ||||
| from cython.operator cimport dereference as deref | ||||
| from libc.stdint cimport uint32_t, uint64_t | ||||
| from libcpp.set cimport set as cppset | ||||
| from murmurhash.mrmr cimport hash128_x64 | ||||
| 
 | ||||
| import functools | ||||
| import warnings | ||||
| from enum import Enum | ||||
| from typing import cast | ||||
|  | @ -15,9 +13,11 @@ from thinc.api import Ops, get_array_module, get_current_ops | |||
| from thinc.backends import get_array_ops | ||||
| from thinc.types import Floats2d | ||||
| 
 | ||||
| from .attrs cimport ORTH, attr_id_t | ||||
| from .strings cimport StringStore | ||||
| 
 | ||||
| from . import util | ||||
| from .attrs import IDS | ||||
| from .errors import Errors, Warnings | ||||
| from .strings import get_string_id | ||||
| 
 | ||||
|  | @ -63,8 +63,9 @@ cdef class Vectors: | |||
|     cdef readonly uint32_t hash_seed | ||||
|     cdef readonly unicode bow | ||||
|     cdef readonly unicode eow | ||||
|     cdef readonly attr_id_t attr | ||||
| 
 | ||||
|     def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): | ||||
|     def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"): | ||||
|         """Create a new vector store. | ||||
| 
 | ||||
|         strings (StringStore): The string store. | ||||
|  | @ -78,6 +79,8 @@ cdef class Vectors: | |||
|         hash_seed (int): The floret hash seed (default: 0). | ||||
|         bow (str): The floret BOW string (default: "<"). | ||||
|         eow (str): The floret EOW string (default: ">"). | ||||
|         attr (Union[int, str]): The token attribute for the vector keys | ||||
|             (default: "ORTH"). | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#init | ||||
|         """ | ||||
|  | @ -100,10 +103,18 @@ cdef class Vectors: | |||
|         self.hash_seed = hash_seed | ||||
|         self.bow = bow | ||||
|         self.eow = eow | ||||
|         if isinstance(attr, (int, long)): | ||||
|             self.attr = attr | ||||
|         else: | ||||
|             attr = attr.upper() | ||||
|             if attr == "TEXT": | ||||
|                 attr = "ORTH" | ||||
|             self.attr = IDS.get(attr, ORTH) | ||||
| 
 | ||||
|         if self.mode == Mode.default: | ||||
|             if data is None: | ||||
|                 if shape is None: | ||||
|                     shape = (0,0) | ||||
|                     shape = (0, 0) | ||||
|                 ops = get_current_ops() | ||||
|                 data = ops.xp.zeros(shape, dtype="f") | ||||
|                 self._unset = cppset[int]({i for i in range(data.shape[0])}) | ||||
|  | @ -244,11 +255,10 @@ cdef class Vectors: | |||
|     def __eq__(self, other): | ||||
|         # Check for equality, with faster checks first | ||||
|         return ( | ||||
|                 self.shape == other.shape | ||||
|                 and self.key2row == other.key2row | ||||
|                 and self.to_bytes(exclude=["strings"]) | ||||
|                   == other.to_bytes(exclude=["strings"]) | ||||
|                ) | ||||
|             self.shape == other.shape | ||||
|             and self.key2row == other.key2row | ||||
|             and self.to_bytes(exclude=["strings"]) == other.to_bytes(exclude=["strings"]) | ||||
|         ) | ||||
| 
 | ||||
|     def resize(self, shape, inplace=False): | ||||
|         """Resize the underlying vectors array. If inplace=True, the memory | ||||
|  | @ -504,11 +514,12 @@ cdef class Vectors: | |||
|             # vectors e.g. (10000, 300) | ||||
|             # sims    e.g. (1024, 10000) | ||||
|             sims = xp.dot(batch, vectors.T) | ||||
|             best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:] | ||||
|             scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:] | ||||
|             best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:, -n:] | ||||
|             scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:, -n:] | ||||
| 
 | ||||
|             if sort and n >= 2: | ||||
|                 sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1] | ||||
|                 sorted_index = xp.arange(scores.shape[0])[:, None][i:i+batch_size], \ | ||||
|                     xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1] | ||||
|                 scores[i:i+batch_size] = scores[sorted_index] | ||||
|                 best_rows[i:i+batch_size] = best_rows[sorted_index] | ||||
| 
 | ||||
|  | @ -522,8 +533,12 @@ cdef class Vectors: | |||
| 
 | ||||
|         numpy_rows = get_current_ops().to_numpy(best_rows) | ||||
|         keys = xp.asarray( | ||||
|             [[row2key[row] for row in numpy_rows[i] if row in row2key] | ||||
|                     for i in range(len(queries)) ], dtype="uint64") | ||||
|             [ | ||||
|                 [row2key[row] for row in numpy_rows[i] if row in row2key] | ||||
|                 for i in range(len(queries)) | ||||
|             ], | ||||
|             dtype="uint64" | ||||
|         ) | ||||
|         return (keys, best_rows, scores) | ||||
| 
 | ||||
|     def to_ops(self, ops: Ops): | ||||
|  | @ -543,6 +558,7 @@ cdef class Vectors: | |||
|                 "hash_seed": self.hash_seed, | ||||
|                 "bow": self.bow, | ||||
|                 "eow": self.eow, | ||||
|                 "attr": self.attr, | ||||
|             } | ||||
| 
 | ||||
|     def _set_cfg(self, cfg): | ||||
|  | @ -553,6 +569,7 @@ cdef class Vectors: | |||
|         self.hash_seed = cfg.get("hash_seed", 0) | ||||
|         self.bow = cfg.get("bow", "<") | ||||
|         self.eow = cfg.get("eow", ">") | ||||
|         self.attr = cfg.get("attr", ORTH) | ||||
| 
 | ||||
|     def to_disk(self, path, *, exclude=tuple()): | ||||
|         """Save the current state to a directory. | ||||
|  | @ -564,9 +581,9 @@ cdef class Vectors: | |||
|         """ | ||||
|         xp = get_array_module(self.data) | ||||
|         if xp is numpy: | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)  # no-cython-lint | ||||
|         else: | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr) | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr)  # no-cython-lint | ||||
| 
 | ||||
|         def save_vectors(path): | ||||
|             # the source of numpy.save indicates that the file object is closed after use. | ||||
|  |  | |||
|  | @ -1,6 +1,4 @@ | |||
| # cython: profile=True | ||||
| from libc.string cimport memcpy | ||||
| 
 | ||||
| import functools | ||||
| 
 | ||||
| import numpy | ||||
|  | @ -19,7 +17,6 @@ from .errors import Errors | |||
| from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop | ||||
| from .lang.norm_exceptions import BASE_NORMS | ||||
| from .lookups import Lookups | ||||
| from .util import registry | ||||
| from .vectors import Mode as VectorsMode | ||||
| from .vectors import Vectors | ||||
| 
 | ||||
|  | @ -50,8 +47,15 @@ cdef class Vocab: | |||
| 
 | ||||
|     DOCS: https://spacy.io/api/vocab | ||||
|     """ | ||||
|     def __init__(self, lex_attr_getters=None, strings=None, lookups=None, | ||||
|             oov_prob=-20., writing_system=None, get_noun_chunks=None): | ||||
|     def __init__( | ||||
|         self, | ||||
|         lex_attr_getters=None, | ||||
|         strings=None, | ||||
|         lookups=None, | ||||
|         oov_prob=-20., | ||||
|         writing_system=None, | ||||
|         get_noun_chunks=None | ||||
|     ): | ||||
|         """Create the vocabulary. | ||||
| 
 | ||||
|         lex_attr_getters (dict): A dictionary mapping attribute IDs to | ||||
|  | @ -150,7 +154,6 @@ cdef class Vocab: | |||
|         cdef LexemeC* lex | ||||
|         cdef hash_t key = self.strings[string] | ||||
|         lex = <LexemeC*>self._by_orth.get(key) | ||||
|         cdef size_t addr | ||||
|         if lex != NULL: | ||||
|             assert lex.orth in self.strings | ||||
|             if lex.orth != key: | ||||
|  | @ -352,8 +355,13 @@ cdef class Vocab: | |||
|             self[orth] | ||||
|         # Make prob negative so it sorts by rank ascending | ||||
|         # (key2row contains the rank) | ||||
|         priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth) | ||||
|                     for lex in self if lex.orth in self.vectors.key2row] | ||||
|         priority = [] | ||||
|         cdef Lexeme lex | ||||
|         cdef attr_t value | ||||
|         for lex in self: | ||||
|             value = Lexeme.get_struct_attr(lex.c, self.vectors.attr) | ||||
|             if value in self.vectors.key2row: | ||||
|                 priority.append((-lex.prob, self.vectors.key2row[value], value)) | ||||
|         priority.sort() | ||||
|         indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64") | ||||
|         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") | ||||
|  | @ -386,8 +394,10 @@ cdef class Vocab: | |||
|         """ | ||||
|         if isinstance(orth, str): | ||||
|             orth = self.strings.add(orth) | ||||
|         if self.has_vector(orth): | ||||
|             return self.vectors[orth] | ||||
|         cdef Lexeme lex = self[orth] | ||||
|         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) | ||||
|         if self.has_vector(key): | ||||
|             return self.vectors[key] | ||||
|         xp = get_array_module(self.vectors.data) | ||||
|         vectors = xp.zeros((self.vectors_length,), dtype="f") | ||||
|         return vectors | ||||
|  | @ -403,15 +413,16 @@ cdef class Vocab: | |||
|         """ | ||||
|         if isinstance(orth, str): | ||||
|             orth = self.strings.add(orth) | ||||
|         if self.vectors.is_full and orth not in self.vectors: | ||||
|         cdef Lexeme lex = self[orth] | ||||
|         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) | ||||
|         if self.vectors.is_full and key not in self.vectors: | ||||
|             new_rows = max(100, int(self.vectors.shape[0]*1.3)) | ||||
|             if self.vectors.shape[1] == 0: | ||||
|                 width = vector.size | ||||
|             else: | ||||
|                 width = self.vectors.shape[1] | ||||
|             self.vectors.resize((new_rows, width)) | ||||
|         lex = self[orth]  # Add word to vocab if necessary | ||||
|         row = self.vectors.add(orth, vector=vector) | ||||
|         row = self.vectors.add(key, vector=vector) | ||||
|         if row >= 0: | ||||
|             lex.rank = row | ||||
| 
 | ||||
|  | @ -426,7 +437,9 @@ cdef class Vocab: | |||
|         """ | ||||
|         if isinstance(orth, str): | ||||
|             orth = self.strings.add(orth) | ||||
|         return orth in self.vectors | ||||
|         cdef Lexeme lex = self[orth] | ||||
|         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) | ||||
|         return key in self.vectors | ||||
| 
 | ||||
|     property lookups: | ||||
|         def __get__(self): | ||||
|  | @ -440,7 +453,6 @@ cdef class Vocab: | |||
|                     self.lookups.get_table("lexeme_norm"), | ||||
|                 ) | ||||
| 
 | ||||
| 
 | ||||
|     def to_disk(self, path, *, exclude=tuple()): | ||||
|         """Save the current state to a directory. | ||||
| 
 | ||||
|  | @ -453,7 +465,6 @@ cdef class Vocab: | |||
|         path = util.ensure_path(path) | ||||
|         if not path.exists(): | ||||
|             path.mkdir() | ||||
|         setters = ["strings", "vectors"] | ||||
|         if "strings" not in exclude: | ||||
|             self.strings.to_disk(path / "strings.json") | ||||
|         if "vectors" not in exclude: | ||||
|  | @ -472,7 +483,6 @@ cdef class Vocab: | |||
|         DOCS: https://spacy.io/api/vocab#from_disk | ||||
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         getters = ["strings", "vectors"] | ||||
|         if "strings" not in exclude: | ||||
|             self.strings.from_disk(path / "strings.json")  # TODO: add exclude? | ||||
|         if "vectors" not in exclude: | ||||
|  |  | |||
|  | @ -303,7 +303,7 @@ mapped to a zero vector. See the documentation on | |||
| | `nM`        | The width of the static vectors. ~~Optional[int]~~                                                                                                                                                                      | | ||||
| | `dropout`   | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~                                                                                                 | | ||||
| | `init_W`    | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]]], FloatsXd]~~ | | ||||
| | `key_attr`  | Defaults to `"ORTH"`. ~~str~~                                                                                                                                                                                           | | ||||
| | `key_attr`  | This setting is ignored in spaCy v3.6+. To set a custom key attribute for vectors, configure it through [`Vectors`](/api/vectors) or [`spacy init vectors`](/api/cli#init-vectors). Defaults to `"ORTH"`. ~~str~~       | | ||||
| | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~                                                                                                                                                          | | ||||
| 
 | ||||
| ### spacy.FeatureExtractor.v1 {id="FeatureExtractor"} | ||||
|  |  | |||
|  | @ -876,7 +876,7 @@ token-to-vector embedding component like [`Tok2Vec`](/api/tok2vec) or | |||
| training a pipeline with components sourced from an existing pipeline: if | ||||
| multiple components (e.g. tagger, parser, NER) listen to the same | ||||
| token-to-vector component, but some of them are frozen and not updated, their | ||||
| performance may degrade significally as the token-to-vector component is updated | ||||
| performance may degrade significantly as the token-to-vector component is updated | ||||
| with new data. To prevent this, listeners can be replaced with a standalone | ||||
| token-to-vector layer that is owned by the component and doesn't change if the | ||||
| component isn't updated. | ||||
|  |  | |||
|  | @ -60,7 +60,7 @@ architectures and their arguments and hyperparameters. | |||
| | `model`      | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~                                                                                           | | ||||
| | `spans_key`  | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | | ||||
| | `threshold`  | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~                                                                                                                                    | | ||||
| | `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~                                                                                                                   | | ||||
| | `max_length` | Maximum length of the produced spans, defaults to `25`. ~~Optional[int]~~                                                                                                                                              | | ||||
| | `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~                                                                                                          | | ||||
| | `scorer`     | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~                                                      | | ||||
| 
 | ||||
|  |  | |||
|  | @ -59,6 +59,7 @@ modified later. | |||
| | `hash_seed` <Tag variant="new">3.2</Tag>  | The floret hash seed (default: `0`). ~~int~~                                                                                                                                           | | ||||
| | `bow` <Tag variant="new">3.2</Tag>        | The floret BOW string (default: `"<"`). ~~str~~                                                                                                                                        | | ||||
| | `eow` <Tag variant="new">3.2</Tag>        | The floret EOW string (default: `">"`). ~~str~~                                                                                                                                        | | ||||
| | `attr` <Tag variant="new">3.6</Tag>       | The token attribute for the vector keys (default: `"ORTH"`). ~~Union[int, str]~~                                                                                                       | | ||||
| 
 | ||||
| ## Vectors.\_\_getitem\_\_ {id="getitem",tag="method"} | ||||
| 
 | ||||
|  | @ -452,8 +453,9 @@ Load state from a binary string. | |||
| 
 | ||||
| ## Attributes {id="attributes"} | ||||
| 
 | ||||
| | Name      | Description                                                                                                                                                          | | ||||
| | --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `data`    | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~   | | ||||
| | `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~                                                                               | | ||||
| | `keys`    | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ | | ||||
| | Name                                | Description                                                                                                                                                          | | ||||
| | ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `data`                              | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~   | | ||||
| | `key2row`                           | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~                                                                               | | ||||
| | `keys`                              | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ | | ||||
| | `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys. ~~int~~                                                                                                                     | | ||||
|  |  | |||
|  | @ -113,7 +113,7 @@ print(doc[2].morph)  # 'Case=Nom|Person=2|PronType=Prs' | |||
| print(doc[2].pos_)  # 'PRON' | ||||
| ``` | ||||
| 
 | ||||
| ## Lemmatization {id="lemmatization",model="lemmatizer",version="3"} | ||||
| ## Lemmatization {id="lemmatization",version="3"} | ||||
| 
 | ||||
| spaCy provides two pipeline components for lemmatization: | ||||
| 
 | ||||
|  | @ -170,7 +170,7 @@ nlp = spacy.blank("sv") | |||
| nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) | ||||
| ``` | ||||
| 
 | ||||
| ### Rule-based lemmatizer {id="lemmatizer-rule"} | ||||
| ### Rule-based lemmatizer {id="lemmatizer-rule",model="morphologizer"} | ||||
| 
 | ||||
| When training pipelines that include a component that assigns part-of-speech | ||||
| tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a | ||||
|  | @ -194,7 +194,7 @@ information, without consulting the context of the token. The rule-based | |||
| lemmatizer also accepts list-based exception files. For English, these are | ||||
| acquired from [WordNet](https://wordnet.princeton.edu/). | ||||
| 
 | ||||
| ### Trainable lemmatizer | ||||
| ### Trainable lemmatizer {id="lemmatizer-train",model="trainable_lemmatizer"} | ||||
| 
 | ||||
| The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma | ||||
| transformations from a training corpus that includes lemma annotations. This | ||||
|  |  | |||
|  | @ -11,7 +11,6 @@ menu: | |||
|   - ['Custom Functions', 'custom-functions'] | ||||
|   - ['Initialization', 'initialization'] | ||||
|   - ['Data Utilities', 'data'] | ||||
|   - ['Parallel Training', 'parallel-training'] | ||||
|   - ['Internal API', 'api'] | ||||
| --- | ||||
| 
 | ||||
|  | @ -1565,77 +1564,6 @@ token-based annotations like the dependency parse or entity labels, you'll need | |||
| to take care to adjust the `Example` object so its annotations match and remain | ||||
| valid. | ||||
| 
 | ||||
| ## Parallel & distributed training with Ray {id="parallel-training"} | ||||
| 
 | ||||
| > #### Installation | ||||
| > | ||||
| > ```bash | ||||
| > $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS | ||||
| > # Check that the CLI is registered | ||||
| > $ python -m spacy ray --help | ||||
| > ``` | ||||
| 
 | ||||
| [Ray](https://ray.io/) is a fast and simple framework for building and running | ||||
| **distributed applications**. You can use Ray to train spaCy on one or more | ||||
| remote machines, potentially speeding up your training process. Parallel | ||||
| training won't always be faster though – it depends on your batch size, models, | ||||
| and hardware. | ||||
| 
 | ||||
| <Infobox variant="warning"> | ||||
| 
 | ||||
| To use Ray with spaCy, you need the | ||||
| [`spacy-ray`](https://github.com/explosion/spacy-ray) package installed. | ||||
| Installing the package will automatically add the `ray` command to the spaCy | ||||
| CLI. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| The [`spacy ray train`](/api/cli#ray-train) command follows the same API as | ||||
| [`spacy train`](/api/cli#train), with a few extra options to configure the Ray | ||||
| setup. You can optionally set the `--address` option to point to your Ray | ||||
| cluster. If it's not set, Ray will run locally. | ||||
| 
 | ||||
| ```bash | ||||
| python -m spacy ray train config.cfg --n-workers 2 | ||||
| ``` | ||||
| 
 | ||||
| <Project id="integrations/ray"> | ||||
| 
 | ||||
| Get started with parallel training using our project template. It trains a | ||||
| simple model on a Universal Dependencies Treebank and lets you parallelize the | ||||
| training with Ray. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ### How parallel training works {id="parallel-training-details"} | ||||
| 
 | ||||
| Each worker receives a shard of the **data** and builds a copy of the **model | ||||
| and optimizer** from the [`config.cfg`](#config). It also has a communication | ||||
| channel to **pass gradients and parameters** to the other workers. Additionally, | ||||
| each worker is given ownership of a subset of the parameter arrays. Every | ||||
| parameter array is owned by exactly one worker, and the workers are given a | ||||
| mapping so they know which worker owns which parameter. | ||||
| 
 | ||||
|  | ||||
| 
 | ||||
| As training proceeds, every worker will be computing gradients for **all** of | ||||
| the model parameters. When they compute gradients for parameters they don't own, | ||||
| they'll **send them to the worker** that does own that parameter, along with a | ||||
| version identifier so that the owner can decide whether to discard the gradient. | ||||
| Workers use the gradients they receive and the ones they compute locally to | ||||
| update the parameters they own, and then broadcast the updated array and a new | ||||
| version ID to the other workers. | ||||
| 
 | ||||
| This training procedure is **asynchronous** and **non-blocking**. Workers always | ||||
| push their gradient increments and parameter updates, they do not have to pull | ||||
| them and block on the result, so the transfers can happen in the background, | ||||
| overlapped with the actual training work. The workers also do not have to stop | ||||
| and wait for each other ("synchronize") at the start of each batch. This is very | ||||
| useful for spaCy, because spaCy is often trained on long documents, which means | ||||
| **batches can vary in size** significantly. Uneven workloads make synchronous | ||||
| gradient descent inefficient, because if one batch is slow, all of the other | ||||
| workers are stuck waiting for it to complete before they can continue. | ||||
| 
 | ||||
| ## Internal training API {id="api"} | ||||
| 
 | ||||
| <Infobox variant="danger"> | ||||
|  |  | |||
							
								
								
									
										143
									
								
								website/docs/usage/v3-6.mdx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										143
									
								
								website/docs/usage/v3-6.mdx
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,143 @@ | |||
| --- | ||||
| title: What's New in v3.6 | ||||
| teaser: New features and how to upgrade | ||||
| menu: | ||||
|   - ['New Features', 'features'] | ||||
|   - ['Upgrading Notes', 'upgrading'] | ||||
| --- | ||||
| 
 | ||||
| ## New features {id="features",hidden="true"} | ||||
| 
 | ||||
| spaCy v3.6 adds the new [`SpanFinder`](/api/spanfinder) component to the core | ||||
| spaCy library and new trained pipelines for Slovenian. | ||||
| 
 | ||||
| ### SpanFinder {id="spanfinder"} | ||||
| 
 | ||||
| The [`SpanFinder`](/api/spanfinder) component identifies potentially | ||||
| overlapping, unlabeled spans by identifying span start and end tokens. It is | ||||
| intended for use in combination with a component like | ||||
| [`SpanCategorizer`](/api/spancategorizer) that may further filter or label the | ||||
| spans. See our | ||||
| [Spancat blog post](https://explosion.ai/blog/spancat#span-finder) for a more | ||||
| detailed introduction to the span finder. | ||||
| 
 | ||||
| To train a pipeline with `span_finder` + `spancat`, remember to add | ||||
| `span_finder` (and its `tok2vec` or `transformer` if required) to | ||||
| `[training.annotating_components]` so that the `spancat` component can be | ||||
| trained directly from its predictions: | ||||
| 
 | ||||
| ```ini | ||||
| [nlp] | ||||
| pipeline = ["tok2vec","span_finder","spancat"] | ||||
| 
 | ||||
| [training] | ||||
| annotating_components = ["tok2vec","span_finder"] | ||||
| ``` | ||||
| 
 | ||||
| In practice it can be helpful to initially train the `span_finder` separately | ||||
| before [sourcing](/usage/processing-pipelines#sourced-components) it (along with | ||||
| its `tok2vec`) into the `spancat` pipeline for further training. Otherwise the | ||||
| memory usage can spike for `spancat` in the first few training steps if the | ||||
| `span_finder` makes a large number of predictions. | ||||
| 
 | ||||
| ### Additional features and improvements {id="additional-features-and-improvements"} | ||||
| 
 | ||||
| - Language updates: | ||||
|   - Add initial support for Malay. | ||||
|   - Update Latin defaults to support noun chunks, update lexical/tokenizer | ||||
|     settings and add example sentences. | ||||
| - Support `spancat_singlelabel` in `spacy debug data` CLI. | ||||
| - Add `doc.spans` rendering to `spacy evaluate` CLI displaCy output. | ||||
| - Support custom token/lexeme attribute for vectors. | ||||
| - Add option to return scores separately keyed by component name with | ||||
|   `spacy evaluate --per-component`, `Language.evaluate(per_component=True)` and | ||||
|   `Scorer.score(per_component=True)`. This is useful when the pipeline contains | ||||
|   more than one of the same component like `textcat` that may have overlapping | ||||
|   score keys. | ||||
| - Typing updates for `PhraseMatcher` and `SpanGroup`. | ||||
| 
 | ||||
| ## Trained pipelines {id="pipelines"} | ||||
| 
 | ||||
| ### New trained pipelines {id="new-pipelines"} | ||||
| 
 | ||||
| v3.6 introduces new pipelines for Slovenian, which use the trainable lemmatizer | ||||
| and [floret vectors](https://github.com/explosion/floret). | ||||
| 
 | ||||
| | Package                                           | UPOS | Parser LAS | NER F | | ||||
| | ------------------------------------------------- | ---: | ---------: | ----: | | ||||
| | [`sl_core_news_sm`](/models/sl#sl_core_news_sm)   | 96.9 |       82.1 |  62.9 | | ||||
| | [`sl_core_news_md`](/models/sl#sl_core_news_md)   | 97.6 |       84.3 |  73.5 | | ||||
| | [`sl_core_news_lg`](/models/sl#sl_core_news_lg)   | 97.7 |       84.3 |  79.0 | | ||||
| | [`sl_core_news_trf`](/models/sl#sl_core_news_trf) | 99.0 |       91.7 |  90.0 | | ||||
| 
 | ||||
| ### Pipeline updates {id="pipeline-updates"} | ||||
| 
 | ||||
| The English pipelines have been updated to improve handling of contractions with | ||||
| various apostrophes and to lemmatize "get" as a passive auxiliary. | ||||
| 
 | ||||
| The Danish pipeline `da_core_news_trf` has been updated to use | ||||
| [`vesteinn/DanskBERT`](https://huggingface.co/vesteinn/DanskBERT) with | ||||
| performance improvements across the board. | ||||
| 
 | ||||
| ## Notes about upgrading from v3.5 {id="upgrading"} | ||||
| 
 | ||||
| ### SpanGroup spans are now required to be from the same doc {id="spangroup-spans"} | ||||
| 
 | ||||
| When initializing a `SpanGroup`, there is a new check to verify that all added | ||||
| spans refer to the current doc. Without this check, it was possible to run into | ||||
| string store or other errors. | ||||
| 
 | ||||
| One place this may crop up is when creating `Example` objects for training with | ||||
| custom spans: | ||||
| 
 | ||||
| ```diff | ||||
|      doc = Doc(nlp.vocab, words=tokens)  # predicted doc | ||||
|      example = Example.from_dict(doc, {"ner": iob_tags}) | ||||
|      # use the reference doc when creating reference spans | ||||
| -    span = Span(doc, 0, 5, "ORG") | ||||
| +    span = Span(example.reference, 0, 5, "ORG") | ||||
|      example.reference.spans[spans_key] = [span] | ||||
| ``` | ||||
| 
 | ||||
| ### Pipeline package version compatibility {id="version-compat"} | ||||
| 
 | ||||
| > #### Using legacy implementations | ||||
| > | ||||
| > In spaCy v3, you'll still be able to load and reference legacy implementations | ||||
| > via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the | ||||
| > components or architectures change and newer versions are available in the | ||||
| > core library. | ||||
| 
 | ||||
| When you're loading a pipeline package trained with an earlier version of spaCy | ||||
| v3, you will see a warning telling you that the pipeline may be incompatible. | ||||
| This doesn't necessarily have to be true, but we recommend running your | ||||
| pipelines against your test suite or evaluation data to make sure there are no | ||||
| unexpected results. | ||||
| 
 | ||||
| If you're using one of the [trained pipelines](/models) we provide, you should | ||||
| run [`spacy download`](/api/cli#download) to update to the latest version. To | ||||
| see an overview of all installed packages and their compatibility, you can run | ||||
| [`spacy validate`](/api/cli#validate). | ||||
| 
 | ||||
| If you've trained your own custom pipeline and you've confirmed that it's still | ||||
| working as expected, you can update the spaCy version requirements in the | ||||
| [`meta.json`](/api/data-formats#meta): | ||||
| 
 | ||||
| ```diff | ||||
| - "spacy_version": ">=3.5.0,<3.6.0", | ||||
| + "spacy_version": ">=3.5.0,<3.7.0", | ||||
| ``` | ||||
| 
 | ||||
| ### Updating v3.5 configs {id="updating-v3-5-configs"} | ||||
| 
 | ||||
| To update a config from spaCy v3.5 with the new v3.6 settings, run | ||||
| [`init fill-config`](/api/cli#init-fill-config): | ||||
| 
 | ||||
| ```cli | ||||
| $ python -m spacy init fill-config config-v3.5.cfg config-v3.6.cfg | ||||
| ``` | ||||
| 
 | ||||
| In many cases ([`spacy train`](/api/cli#train), | ||||
| [`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in | ||||
| automatically, but you'll need to fill in the new settings to run | ||||
| [`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). | ||||
|  | @ -222,7 +222,9 @@ | |||
|         }, | ||||
|         { | ||||
|             "code": "la", | ||||
|             "name": "Latin" | ||||
|             "name": "Latin", | ||||
| 	    "example": "In principio creavit Deus caelum et terram.", | ||||
| 	    "has_examples": true | ||||
|         }, | ||||
|         { | ||||
|             "code": "lb", | ||||
|  | @ -339,7 +341,10 @@ | |||
|         }, | ||||
|         { | ||||
|             "code": "sl", | ||||
|             "name": "Slovenian" | ||||
|             "name": "Slovenian", | ||||
| 	    "example": "France Prešeren je umrl 8. februarja 1849 v Kranju", | ||||
| 	    "has_examples": true, | ||||
|             "models": ["sl_core_news_sm", "sl_core_news_md", "sl_core_news_lg", "sl_core_news_trf"] | ||||
|         }, | ||||
|         { | ||||
|             "code": "sq", | ||||
|  |  | |||
|  | @ -14,7 +14,8 @@ | |||
|                     { "text": "New in v3.2", "url": "/usage/v3-2" }, | ||||
|                     { "text": "New in v3.3", "url": "/usage/v3-3" }, | ||||
|                     { "text": "New in v3.4", "url": "/usage/v3-4" }, | ||||
|                     { "text": "New in v3.5", "url": "/usage/v3-5" } | ||||
|                     { "text": "New in v3.5", "url": "/usage/v3-5" }, | ||||
|                     { "text": "New in v3.6", "url": "/usage/v3-6" } | ||||
|                 ] | ||||
|             }, | ||||
|             { | ||||
|  |  | |||
|  | @ -27,7 +27,7 @@ | |||
|         "indexName": "spacy" | ||||
|     }, | ||||
|     "binderUrl": "explosion/spacy-io-binder", | ||||
|     "binderVersion": "3.5", | ||||
|     "binderVersion": "3.6", | ||||
|     "sections": [ | ||||
|         { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, | ||||
|         { "id": "models", "title": "Models Documentation", "theme": "blue" }, | ||||
|  |  | |||
|  | @ -4376,7 +4376,7 @@ | |||
|             "code_example": [ | ||||
|                 "import spacy", | ||||
|                 "", | ||||
|                 "nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])", | ||||
|                 "nlp = spacy.load(\"en_core_web_sm\", exclude=[\"ner\"])", | ||||
|                 "nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})", | ||||
|                 "", | ||||
|                 "text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\", | ||||
|  |  | |||
|  | @ -13,6 +13,8 @@ import 'prismjs/components/prism-json.min.js' | |||
| import 'prismjs/components/prism-markdown.min.js' | ||||
| import 'prismjs/components/prism-python.min.js' | ||||
| import 'prismjs/components/prism-yaml.min.js' | ||||
| import 'prismjs/components/prism-docker.min.js' | ||||
| import 'prismjs/components/prism-r.min.js' | ||||
| 
 | ||||
| import { isString } from './util' | ||||
| import Link, { OptionalLink } from './link' | ||||
|  | @ -172,7 +174,7 @@ const convertLine = ({ line, prompt, lang }) => { | |||
|         return handlePromot({ lineFlat, prompt }) | ||||
|     } | ||||
| 
 | ||||
|     return lang === 'none' || !lineFlat ? ( | ||||
|     return lang === 'none' || !lineFlat || !(lang in Prism.languages) ? ( | ||||
|         lineFlat | ||||
|     ) : ( | ||||
|         <span | ||||
|  |  | |||
|  | @ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => { | |||
| } | ||||
| 
 | ||||
| const navAlert = ( | ||||
|     <Link to="/usage/v3-5" noLinkLayout> | ||||
|         <strong>💥 Out now:</strong> spaCy v3.5 | ||||
|     <Link to="/usage/v3-6" noLinkLayout> | ||||
|         <strong>💥 Out now:</strong> spaCy v3.6 | ||||
|     </Link> | ||||
| ) | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user