mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 01:48:04 +03:00

Merge branch 'upstream_master' into sync_develop

This commit is contained in:
commit 79ec68f01b

.github/workflows/tests.yml | 6 (vendored)
@@ -45,6 +45,12 @@ jobs:
         run: |
           python -m pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+      - name: cython-lint
+        run: |
+          python -m pip install cython-lint -c requirements.txt
+          # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
+          cython-lint spacy --ignore E501,W291,E266
+
   tests:
     name: Test
     needs: Validate
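
The new workflow step adds cython-lint alongside flake8 and installs it with requirements.txt as a pip constraints file (`-c`), so the CI pin and the local pin stay in sync. A minimal sketch of reproducing the same check locally, written in Python for consistency with the other examples here; it assumes cython-lint is on PATH and follows the usual linter convention of a non-zero exit code when violations are found:

    # Sketch: run the same cython-lint check the workflow runs, from Python.
    # Assumes `pip install cython-lint -c requirements.txt` has been done and
    # that the tool exits non-zero on findings (the usual linter convention).
    import subprocess
    import sys

    result = subprocess.run(
        ["cython-lint", "spacy", "--ignore", "E501,W291,E266"],
        capture_output=True,
        text=True,
    )
    print(result.stdout, end="")
    sys.exit(result.returncode)
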
							
								
								
									
Makefile | 4

@@ -1,11 +1,11 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
+override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif
 
 ifndef PYVER
-override PYVER = 3.6
+override PYVER = 3.8
 endif
 
 VENV := ./env$(PYVER)
@@ -39,4 +39,5 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+cython-lint>=0.15.0; python_version >= "3.7"
 isort>=5.0,<6.0
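
The added pin (in what appears to be requirements.txt, given the `-c requirements.txt` constraint in the workflow above) carries a PEP 508 environment marker so the tool is only installed on Python 3.7 and later. A sketch of how such a marker evaluates, using the third-party `packaging` library (pip uses the same machinery internally):

    # Sketch: parsing the new pin and evaluating its PEP 508 marker.
    from packaging.requirements import Requirement

    req = Requirement('cython-lint>=0.15.0; python_version >= "3.7"')
    print(req.name, req.specifier)   # cython-lint >=0.15.0
    print(req.marker.evaluate())     # True on Python 3.7+, False on older versions
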
@@ -117,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         if "pos" in stringy_attrs:
             stringy_attrs["TAG"] = stringy_attrs.pop("pos")
         if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")
+            morphs = stringy_attrs.pop("morph")  # no-cython-lint
         if "number" in stringy_attrs:
             stringy_attrs.pop("number")
         if "tenspect" in stringy_attrs:
@@ -1,4 +1,3 @@
-import itertools
 import uuid
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -218,7 +217,7 @@ class SpanRenderer:
                     + (self.offset_step * (len(entities) - 1))
                 )
                 markup += self.span_template.format(
-                    text=token["text"],
+                    text=escape_html(token["text"]),
                     span_slices=slices,
                     span_starts=starts,
                     total_height=total_height,
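
The SpanRenderer fix routes token text through escape_html before it is substituted into the span markup template, so text like `AT&T` or a literal `<b>` in a document cannot be interpreted as HTML in displaCy's output. A stdlib sketch of the escaping involved; `html.escape` stands in for spaCy's own helper, and the template string here is invented for illustration:

    # Sketch of what the fix does: escape token text before substitution.
    import html

    span_template = '<span class="spacy-token">{text}</span>'  # illustrative only
    token_text = 'AT&T <b>bold</b>'

    print(span_template.format(text=token_text))                # raw markup leaks through
    print(span_template.format(text=html.escape(token_text)))   # rendered literally
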
@@ -4,7 +4,8 @@ from ..typedefs cimport hash_t
 from .kb cimport KnowledgeBase
 
 
-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+# Object used by the Entity Linker that summarizes one entity-alias candidate
+# combination.
 cdef class Candidate:
     cdef readonly KnowledgeBase kb
     cdef hash_t entity_hash
@@ -8,15 +8,24 @@ from ..tokens import Span
 
 
 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
-    algorithm which will disambiguate the various candidates to the correct one.
+    """A `Candidate` object refers to a textual mention (`alias`) that may or
+    may not be resolved to a specific `entity` from a Knowledge Base. This
+    will be used as input for the entity linking algorithm which will
+    disambiguate the various candidates to the correct one.
     Each candidate (alias, entity) pair is assigned a certain prior probability.
 
     DOCS: https://spacy.io/api/kb/#candidate-init
     """
 
-    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
+    def __init__(
+        self,
+        KnowledgeBase kb,
+        entity_hash,
+        entity_freq,
+        entity_vector,
+        alias_hash,
+        prior_prob
+    ):
         self.kb = kb
         self.entity_hash = entity_hash
         self.entity_freq = entity_freq

@@ -59,7 +68,8 @@ cdef class Candidate:
 
 def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
     """
-    Return candidate entities for a given mention and fetching appropriate entries from the index.
+    Return candidate entities for a given mention and fetching appropriate
+    entries from the index.
     kb (KnowledgeBase): Knowledge base to query.
     mention (Span): Entity mention for which to identify candidates.
     RETURNS (Iterable[Candidate]): Identified candidates.

@@ -67,9 +77,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
     return kb.get_candidates(mention)
 
 
-def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+def get_candidates_batch(
+        kb: KnowledgeBase, mentions: Iterable[Span]
+) -> Iterable[Iterable[Candidate]]:
     """
-    Return candidate entities for the given mentions and fetching appropriate entries from the index.
+    Return candidate entities for the given mentions and fetching appropriate entries
+    from the index.
     kb (KnowledgeBase): Knowledge base to query.
     mention (Iterable[Span]): Entity mentions for which to identify candidates.
     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
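
get_candidates and get_candidates_batch are thin module-level wrappers that delegate to the KB instance; they serve as the default candidate generators for the entity linker. A hedged usage sketch, assuming a spaCy version where InMemoryLookupKB lives in spacy.kb; the entity ID "Q42", the alias, and the vector values are invented:

    # Sketch: the module-level helpers simply forward to the KB instance.
    # Assumes spaCy >= 3.5 (spacy.kb package); the identifiers are made up.
    import spacy
    from spacy.kb import InMemoryLookupKB, get_candidates, get_candidates_batch

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12.0, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])

    doc = nlp("Douglas Adams wrote it.")
    mention = doc[0:2]  # the Span "Douglas Adams"
    print([c.entity_ for c in get_candidates(kb, mention)])  # ['Q42']
    print(list(get_candidates_batch(kb, [mention])))         # one list per mention
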
@@ -12,8 +12,9 @@ from .candidate import Candidate
 
 
 cdef class KnowledgeBase:
-    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
-    to support entity linking of named entities to real-world concepts.
+    """A `KnowledgeBase` instance stores unique identifiers for entities and
+    their textual aliases, to support entity linking of named entities to
+    real-world concepts.
     This is an abstract class and requires its operations to be implemented.
 
     DOCS: https://spacy.io/api/kb

@@ -31,10 +32,13 @@ cdef class KnowledgeBase:
         self.entity_vector_length = entity_vector_length
         self.mem = Pool()
 
-    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(
+        self, mentions: Iterable[Span]
+    ) -> Iterable[Iterable[Candidate]]:
         """
-        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
+        Return candidate entities for specified texts. Each candidate defines
+        the entity, the original alias, and the prior probability of that
+        alias resolving to that entity.
         If no candidate is found for a given text, an empty list is returned.
         mentions (Iterable[Span]): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.

@@ -43,14 +47,17 @@ cdef class KnowledgeBase:
 
     def get_candidates(self, mention: Span) -> Iterable[Candidate]:
         """
-        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
+        Return candidate entities for specified text. Each candidate defines
+        the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
         If the no candidate is found for a given text, an empty list is returned.
         mention (Span): Mention for which to get candidates.
         RETURNS (Iterable[Candidate]): Identified candidates.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="get_candidates", name=self.__name__
+            )
         )
 
     def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:

@@ -68,7 +75,9 @@ cdef class KnowledgeBase:
         RETURNS (Iterable[float]): Vector for specified entity.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="get_vector", name=self.__name__
+            )
         )
 
     def to_bytes(self, **kwargs) -> bytes:

@@ -76,7 +85,9 @@ cdef class KnowledgeBase:
         RETURNS (bytes): Current state as binary string.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="to_bytes", name=self.__name__
+            )
         )
 
     def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):

@@ -85,25 +96,35 @@ cdef class KnowledgeBase:
         exclude (Tuple[str]): Properties to exclude when restoring KB.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="from_bytes", name=self.__name__
+            )
         )
 
-    def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+    def to_disk(
+            self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
         """
         Write KnowledgeBase content to disk.
         path (Union[str, Path]): Target file path.
         exclude (Iterable[str]): List of components to exclude.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="to_disk", name=self.__name__
+            )
        )
 
-    def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+    def from_disk(
+            self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
         """
         Load KnowledgeBase content from disk.
         path (Union[str, Path]): Target file path.
         exclude (Iterable[str]): List of components to exclude.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="from_disk", name=self.__name__
+            )
         )
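
After this change every KnowledgeBase operation raises NotImplementedError through Errors.E1045 until a subclass overrides it, which is what makes custom candidate-generation backends possible. A hedged sketch of the override pattern; the dictionary-backed lookup and the StaticKB name are invented for illustration:

    # Sketch: subclassing the abstract KnowledgeBase. Only get_candidates is
    # overridden; everything else (to_bytes, to_disk, ...) still raises
    # NotImplementedError with the E1045 message. StaticKB is hypothetical.
    from typing import Dict, Iterable, List

    from spacy.kb import Candidate, KnowledgeBase
    from spacy.tokens import Span


    class StaticKB(KnowledgeBase):
        def __init__(self, vocab, entity_vector_length: int,
                     table: Dict[str, List[Candidate]]):
            super().__init__(vocab, entity_vector_length)
            self._table = table  # mention text -> precomputed candidates

        def get_candidates(self, mention: Span) -> Iterable[Candidate]:
            return self._table.get(mention.text, [])
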
@@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-
     cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
         """Add an entity vector to the vectors table."""
         cdef int64_t new_index = self._vectors_table.size()
         self._vectors_table.push_back(entity_vector)
         return new_index
 
-    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
-                                     int32_t vector_index, int feats_row) nogil:
+    cdef inline int64_t c_add_entity(
+        self,
+        hash_t entity_hash,
+        float freq,
+        int32_t vector_index,
+        int feats_row
+    ) nogil:
         """Add an entry to the vector of entries.
-        After calling this method, make sure to update also the _entry_index using the return value"""
+        After calling this method, make sure to update also the _entry_index
+        using the return value"""
         # This is what we'll map the entity hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
         cdef int64_t new_index = self._entries.size()
 
-        # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
+        # Avoid struct initializer to enable nogil, cf.
+        # https://github.com/cython/cython/issues/1642
         cdef KBEntryC entry
         entry.entity_hash = entity_hash
         entry.vector_index = vector_index

@@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         self._entries.push_back(entry)
         return new_index
 
-    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
-        """Connect a mention to a list of potential entities with their prior probabilities .
-        After calling this method, make sure to update also the _alias_index using the return value"""
-        # This is what we'll map the alias hash key to. It's where the alias will be defined
-        # in the vector of aliases.
+    cdef inline int64_t c_add_aliases(
+        self,
+        hash_t alias_hash,
+        vector[int64_t] entry_indices,
+        vector[float] probs
+    ) nogil:
+        """Connect a mention to a list of potential entities with their prior
+        probabilities. After calling this method, make sure to update also the
+        _alias_index using the return value"""
+        # This is what we'll map the alias hash key to. It's where the alias will be
+        # defined in the vector of aliases.
         cdef int64_t new_index = self._aliases_table.size()
 
         # Avoid struct initializer to enable nogil

@@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 
     cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
         """
-        Initializing the vectors and making sure the first element of each vector is a dummy,
-        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
+        Initializing the vectors and making sure the first element of each vector is a
+        dummy, because the PreshMap maps pointing to indices in these vectors can not
+        contain 0 as value.
         cf. https://github.com/explosion/preshed/issues/17
         """
         cdef int32_t dummy_value = 0

@@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 cdef class Writer:
     cdef FILE* _fp
 
-    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
+    cdef int write_header(
+        self, int64_t nr_entries, int64_t entity_vector_length
+    ) except -1
     cdef int write_vector_element(self, float element) except -1
-    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
+    cdef int write_entry(
+        self, hash_t entry_hash, float entry_freq, int32_t vector_index
+    ) except -1
+
     cdef int write_alias_length(self, int64_t alias_length) except -1
-    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
+    cdef int write_alias_header(
+        self, hash_t alias_hash, int64_t candidate_length
+    ) except -1
     cdef int write_alias(self, int64_t entry_index, float prob) except -1
 
     cdef int _write(self, void* value, size_t size) except -1

@@ -143,12 +161,18 @@ cdef class Writer:
 cdef class Reader:
     cdef FILE* _fp
 
-    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
+    cdef int read_header(
+        self, int64_t* nr_entries, int64_t* entity_vector_length
+    ) except -1
    cdef int read_vector_element(self, float* element) except -1
-    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
+    cdef int read_entry(
+        self, hash_t* entity_hash, float* freq, int32_t* vector_index
+    ) except -1
+
     cdef int read_alias_length(self, int64_t* alias_length) except -1
-    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
+    cdef int read_alias_header(
+        self, hash_t* alias_hash, int64_t* candidate_length
+    ) except -1
     cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
 
     cdef int _read(self, void* value, size_t size) except -1
@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True
-from typing import Any, Callable, Dict, Iterable, Union
+from typing import Any, Callable, Dict, Iterable
 
 import srsly
 

@@ -27,8 +27,9 @@ from .candidate import Candidate as Candidate
 
 
 cdef class InMemoryLookupKB(KnowledgeBase):
-    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
-    to support entity linking of named entities to real-world concepts.
+    """An `InMemoryLookupKB` instance stores unique identifiers for entities
+    and their textual aliases, to support entity linking of named entities to
+    real-world concepts.
 
     DOCS: https://spacy.io/api/inmemorylookupkb
     """

@@ -71,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 
     def add_entity(self, str entity, float freq, vector[float] entity_vector):
         """
-        Add an entity to the KB, optionally specifying its log probability based on corpus frequency
+        Add an entity to the KB, optionally specifying its log probability
+        based on corpus frequency.
         Return the hash of the entity ID/name at the end.
         """
         cdef hash_t entity_hash = self.vocab.strings.add(entity)

@@ -83,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 
         # Raise an error if the provided entity vector is not of the correct length
         if len(entity_vector) != self.entity_vector_length:
-            raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+            raise ValueError(
+                Errors.E141.format(
+                    found=len(entity_vector), required=self.entity_vector_length
+                )
+            )
 
         vector_index = self.c_add_vector(entity_vector=entity_vector)
 
-        new_index = self.c_add_entity(entity_hash=entity_hash,
+        new_index = self.c_add_entity(
+            entity_hash=entity_hash,
             freq=freq,
             vector_index=vector_index,
-                                      feats_row=-1)  # Features table currently not implemented
+            feats_row=-1
+        )  # Features table currently not implemented
         self._entry_index[entity_hash] = new_index
 
         return entity_hash

@@ -115,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             else:
                 entity_vector = vector_list[i]
                 if len(entity_vector) != self.entity_vector_length:
-                    raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+                    raise ValueError(
+                        Errors.E141.format(
+                            found=len(entity_vector),
+                            required=self.entity_vector_length
+                        )
+                    )
 
                 entry.entity_hash = entity_hash
                 entry.freq = freq_list[i]

@@ -149,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         previous_alias_nr = self.get_size_aliases()
         # Throw an error if the length of entities and probabilities are not the same
         if not len(entities) == len(probabilities):
-            raise ValueError(Errors.E132.format(alias=alias,
+            raise ValueError(
+                Errors.E132.format(
+                    alias=alias,
                     entities_length=len(entities),
-                                                probabilities_length=len(probabilities)))
+                    probabilities_length=len(probabilities))
+            )
 
-        # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
+        # Throw an error if the probabilities sum up to more than 1 (allow for
+        # some rounding errors)
         prob_sum = sum(probabilities)
         if prob_sum > 1.00001:
             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))

@@ -170,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 
         for entity, prob in zip(entities, probabilities):
             entity_hash = self.vocab.strings[entity]
-            if not entity_hash in self._entry_index:
+            if entity_hash not in self._entry_index:
                 raise ValueError(Errors.E134.format(entity=entity))
 
             entry_index = <int64_t>self._entry_index.get(entity_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
-        new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
+        new_index = self.c_add_aliases(
+            alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
+        )
         self._alias_index[alias_hash] = new_index
 
         if previous_alias_nr + 1 != self.get_size_aliases():
             raise RuntimeError(Errors.E891.format(alias=alias))
         return alias_hash
 
-    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
+    def append_alias(
+        self, str alias, str entity, float prior_prob, ignore_warnings=False
+    ):
         """
-        For an alias already existing in the KB, extend its potential entities with one more.
+        For an alias already existing in the KB, extend its potential entities
+        with one more.
         Throw a warning if either the alias or the entity is unknown,
         or when the combination is already previously recorded.
         Throw an error if this entity+prior prob would exceed the sum of 1.
-        For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
+        For efficiency, it's best to use the method `add_alias` as much as
+        possible instead of this one.
         """
         # Check if the alias exists in the KB
         cdef hash_t alias_hash = self.vocab.strings[alias]
-        if not alias_hash in self._alias_index:
+        if alias_hash not in self._alias_index:
             raise ValueError(Errors.E176.format(alias=alias))
 
         # Check if the entity exists in the KB
         cdef hash_t entity_hash = self.vocab.strings[entity]
-        if not entity_hash in self._entry_index:
+        if entity_hash not in self._entry_index:
             raise ValueError(Errors.E134.format(entity=entity))
         entry_index = <int64_t>self._entry_index.get(entity_hash)
 
-        # Throw an error if the prior probabilities (including the new one) sum up to more than 1
+        # Throw an error if the prior probabilities (including the new one)
+        # sum up to more than 1
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
         current_sum = sum([p for p in alias_entry.probs])
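
add_alias rejects probability lists that sum to more than 1 (with a small tolerance, prob_sum > 1.00001), and append_alias re-checks the running sum when a new entity is attached to an existing alias. Such priors are typically relative frequencies; a small sketch with invented counts:

    # Sketch: deriving per-alias priors from counts so they pass the sum check.
    # The counts are invented; in practice they come from corpus statistics.
    counts = {"Q42": 8, "Q5284": 2}   # times the alias resolved to each entity
    total = sum(counts.values())
    entities = list(counts)
    probabilities = [counts[e] / total for e in entities]  # [0.8, 0.2]
    assert sum(probabilities) <= 1.00001

    # kb.add_alias(alias="Adams", entities=entities, probabilities=probabilities)
    # kb.get_alias_candidates("Adams") then yields one Candidate per entity,
    # carrying these priors.
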
@@ -236,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 
     def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
         """
-        Return candidate entities for an alias. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
+        Return candidate entities for an alias. Each candidate defines the
+        entity, the original alias, and the prior probability of that alias
+        resolving to that entity.
         If the alias is not known in the KB, and empty list is returned.
         """
         cdef hash_t alias_hash = self.vocab.strings[alias]
-        if not alias_hash in self._alias_index:
+        if alias_hash not in self._alias_index:
             return []
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]

@@ -249,10 +274,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         return [Candidate(kb=self,
                           entity_hash=self._entries[entry_index].entity_hash,
                           entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                          entity_vector=self._vectors_table[
+                              self._entries[entry_index].vector_index
+                          ],
                           alias_hash=alias_hash,
                           prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+                for (entry_index, prior_prob) in zip(
+                    alias_entry.entry_indices, alias_entry.probs
+                )
                 if entry_index != 0]
 
     def get_vector(self, str entity):
@@ -266,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         return self._vectors_table[self._entries[entry_index].vector_index]
 
     def get_prior_prob(self, str entity, str alias):
-        """ Return the prior probability of a given alias being linked to a given entity,
-        or return 0.0 when this combination is not known in the knowledge base"""
+        """ Return the prior probability of a given alias being linked to a
+        given entity, or return 0.0 when this combination is not known in the
+        knowledge base."""
         cdef hash_t alias_hash = self.vocab.strings[alias]
         cdef hash_t entity_hash = self.vocab.strings[entity]
 

@@ -278,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         entry_index = self._entry_index[entity_hash]
 
         alias_entry = self._aliases_table[alias_index]
-        for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
+        for (entry_index, prior_prob) in zip(
+            alias_entry.entry_indices, alias_entry.probs
+        ):
             if self._entries[entry_index].entity_hash == entity_hash:
                 return prior_prob
 
@@ -288,13 +320,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         """Serialize the current state to a binary string.
         """
         def serialize_header():
-            header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
+            header = (
+                self.get_size_entities(),
+                self.get_size_aliases(),
+                self.entity_vector_length
+            )
             return srsly.json_dumps(header)
 
         def serialize_entries():
             i = 1
             tuples = []
-            for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+            for entry_hash, entry_index in sorted(
+                self._entry_index.items(), key=lambda x: x[1]
+            ):
                 entry = self._entries[entry_index]
                 assert entry.entity_hash == entry_hash
                 assert entry_index == i

@@ -307,7 +345,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             headers = []
             indices_lists = []
             probs_lists = []
-            for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+            for alias_hash, alias_index in sorted(
+                self._alias_index.items(), key=lambda x: x[1]
+            ):
                 alias = self._aliases_table[alias_index]
                 assert alias_index == i
                 candidate_length = len(alias.entry_indices)

@@ -365,7 +405,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             indices = srsly.json_loads(all_data[1])
             probs = srsly.json_loads(all_data[2])
             for header, indices, probs in zip(headers, indices, probs):
-                alias_hash, candidate_length = header
+                alias_hash, _candidate_length = header
                 alias.entry_indices = indices
                 alias.probs = probs
                 self._aliases_table[i] = alias
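
to_bytes serializes a JSON header (entity count, alias count, vector length) followed by the entry and alias tables, and the from_bytes path reads them back in the same order; the rename to _candidate_length makes explicit that the deserializer ignores that header field. A hedged round-trip sketch; the entity ID, alias, and vector values are made up:

    # Sketch: byte-level round trip of an InMemoryLookupKB.
    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12.0, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])

    kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb2.from_bytes(kb.to_bytes())
    assert kb2.get_size_entities() == kb.get_size_entities()
    assert kb2.get_size_aliases() == kb.get_size_aliases()
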
@@ -414,10 +454,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
                 writer.write_vector_element(element)
             i = i+1
 
-        # dumping the entry records in the order in which they are in the _entries vector.
-        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        # dumping the entry records in the order in which they are in the
+        # _entries vector.
+        # index 0 is a dummy object not stored in the _entry_index and can
+        # be ignored.
         i = 1
-        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+        for entry_hash, entry_index in sorted(
+            self._entry_index.items(), key=lambda x: x[1]
+        ):
             entry = self._entries[entry_index]
             assert entry.entity_hash == entry_hash
             assert entry_index == i

@@ -429,7 +473,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         # dumping the aliases in the order in which they are in the _alias_index vector.
         # index 0 is a dummy object not stored in the _aliases_table and can be ignored.
         i = 1
-        for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+        for alias_hash, alias_index in sorted(
+                self._alias_index.items(), key=lambda x: x[1]
+        ):
             alias = self._aliases_table[alias_index]
             assert alias_index == i
 
					@ -535,7 +581,8 @@ cdef class Writer:
 | 
				
			||||||
    def __init__(self, path):
 | 
					    def __init__(self, path):
 | 
				
			||||||
        assert isinstance(path, Path)
 | 
					        assert isinstance(path, Path)
 | 
				
			||||||
        content = bytes(path)
 | 
					        content = bytes(path)
 | 
				
			||||||
        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
 | 
					        cdef bytes bytes_loc = content.encode('utf8') \
 | 
				
			||||||
 | 
					            if type(content) == str else content
 | 
				
			||||||
        self._fp = fopen(<char*>bytes_loc, 'wb')
 | 
					        self._fp = fopen(<char*>bytes_loc, 'wb')
 | 
				
			||||||
        if not self._fp:
 | 
					        if not self._fp:
 | 
				
			||||||
            raise IOError(Errors.E146.format(path=path))
 | 
					            raise IOError(Errors.E146.format(path=path))
 | 
				
			||||||
@@ -545,14 +592,18 @@ cdef class Writer:
         cdef size_t status = fclose(self._fp)
         assert status == 0

-    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
+    cdef int write_header(
+        self, int64_t nr_entries, int64_t entity_vector_length
+    ) except -1:
         self._write(&nr_entries, sizeof(nr_entries))
         self._write(&entity_vector_length, sizeof(entity_vector_length))

     cdef int write_vector_element(self, float element) except -1:
         self._write(&element, sizeof(element))

-    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
+    cdef int write_entry(
+        self, hash_t entry_hash, float entry_freq, int32_t vector_index
+    ) except -1:
         self._write(&entry_hash, sizeof(entry_hash))
         self._write(&entry_freq, sizeof(entry_freq))
         self._write(&vector_index, sizeof(vector_index))
@@ -561,7 +612,9 @@ cdef class Writer:
     cdef int write_alias_length(self, int64_t alias_length) except -1:
         self._write(&alias_length, sizeof(alias_length))

-    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
+    cdef int write_alias_header(
+        self, hash_t alias_hash, int64_t candidate_length
+    ) except -1:
         self._write(&alias_hash, sizeof(alias_hash))
         self._write(&candidate_length, sizeof(candidate_length))

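For orientation, the `Writer` methods above each write fixed-width binary fields. A hedged sketch of the `write_entry` record layout using Python's `struct` module; the field order (hash, frequency, vector index) mirrors the code, but the little-endian byte order below is an explicit assumption, while the real writer emits whatever the platform's native layout is:

import struct

# One entry record: uint64 hash, float32 frequency, int32 vector index.
def pack_entry(entry_hash: int, entry_freq: float, vector_index: int) -> bytes:
    return struct.pack("<Qfi", entry_hash, entry_freq, vector_index)

blob = pack_entry(0x1234ABCD, 0.5, 7)
assert len(blob) == 16                      # 8 + 4 + 4 bytes
assert struct.unpack("<Qfi", blob)[2] == 7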
@@ -577,16 +630,19 @@ cdef class Writer:
 cdef class Reader:
     def __init__(self, path):
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+        cdef bytes bytes_loc = content.encode('utf8') \
+            if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)
-        status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header
+        fseek(self._fp, 0, 0)  # this can be 0 if there is no header

     def __dealloc__(self):
         fclose(self._fp)

-    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
+    cdef int read_header(
+        self, int64_t* nr_entries, int64_t* entity_vector_length
+    ) except -1:
         status = self._read(nr_entries, sizeof(int64_t))
         if status < 1:
             if feof(self._fp):
@@ -606,7 +662,9 @@ cdef class Reader:
                 return 0  # end of file
             raise IOError(Errors.E145.format(param="vector element"))

-    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
+    cdef int read_entry(
+        self, hash_t* entity_hash, float* freq, int32_t* vector_index
+    ) except -1:
         status = self._read(entity_hash, sizeof(hash_t))
         if status < 1:
             if feof(self._fp):
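The `read_*` methods above share one contract: a read that returns less than one full item means either a clean end-of-file (return 0) or a truncated record (raise E145). A plain-Python sketch of that contract; `read_exact` is a hypothetical helper, not spaCy API:

def read_exact(fp, n: int) -> bytes:
    """Return n bytes, b"" on clean EOF, raise on a truncated record."""
    data = fp.read(n)
    if not data:
        return b""                          # clean EOF: caller returns 0
    if len(data) < n:
        raise IOError("truncated record")   # mid-record EOF is an error
    return data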
@@ -637,7 +695,9 @@ cdef class Reader:
                 return 0  # end of file
             raise IOError(Errors.E145.format(param="alias length"))

-    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
+    cdef int read_alias_header(
+        self, hash_t* alias_hash, int64_t* candidate_length
+    ) except -1:
         status = self._read(alias_hash, sizeof(hash_t))
         if status < 1:
             if feof(self._fp):

@@ -1826,7 +1826,6 @@ class Language:
         # Later we replace the component config with the raw config again.
         interpolated = filled.interpolate() if not filled.is_interpolated else filled
         pipeline = interpolated.get("components", {})
-        sourced = util.get_sourced_components(interpolated)
         # If components are loaded from a source (existing models), we cache
         # them here so they're only loaded once
         source_nlps = {}
@@ -1959,7 +1958,7 @@ class Language:
         useful when training a pipeline with components sourced from an existing
         pipeline: if multiple components (e.g. tagger, parser, NER) listen to
         the same tok2vec component, but some of them are frozen and not updated,
-        their performance may degrade significally as the tok2vec component is
+        their performance may degrade significantly as the tok2vec component is
         updated with new data. To prevent this, listeners can be replaced with
         a standalone tok2vec layer that is owned by the component and doesn't
         change if the component isn't updated.

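The typo corrected above (significally -> significantly) sits in the docstring of `Language.replace_listeners`, which describes how a listening component can be given its own tok2vec copy. A short usage sketch of the documented API; the loaded pipeline below is an assumption:

import spacy

# Give the NER component its own copy of the shared tok2vec layer, so a
# frozen sibling component no longer degrades as tok2vec keeps training.
nlp = spacy.load("en_core_web_sm")  # assumed to be installed
nlp.replace_listeners("tok2vec", "ner", ["model.tok2vec"])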
@@ -1,7 +1,6 @@
 # cython: embedsignature=True
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
-from cython.view cimport array as cvarray
 from libc.string cimport memset

 np.import_array()
@@ -137,9 +136,11 @@ cdef class Lexeme:
         if hasattr(other, "orth"):
             if self.c.orth == other.orth:
                 return 1.0
-        elif hasattr(other, "__len__") and len(other) == 1 \
-        and hasattr(other[0], "orth"):
-            if self.c.orth == other[0].orth:
-                return 1.0
+        elif (
+            hasattr(other, "__len__") and len(other) == 1
+            and hasattr(other[0], "orth")
+            and self.c.orth == other[0].orth
+        ):
+            return 1.0
         if self.vector_norm == 0 or other.vector_norm == 0:
             warnings.warn(Warnings.W008.format(obj="Lexeme"))

@@ -108,7 +108,7 @@ cdef class DependencyMatcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self.has_key(key)
+        return self.has_key(key)  # no-cython-lint: W601

     def _validate_input(self, pattern, key):
         idx = 0
@@ -264,7 +264,7 @@ cdef class DependencyMatcher:

     def remove(self, key):
         key = self._normalize_key(key)
-        if not key in self._patterns:
+        if key not in self._patterns:
             raise ValueError(Errors.E175.format(key=key))
         self._patterns.pop(key)
         self._raw_patterns.pop(key)

@@ -12,31 +12,18 @@ import warnings

 import srsly

-from ..attrs cimport (
-    DEP,
-    ENT_IOB,
-    ID,
-    LEMMA,
-    MORPH,
-    NULL_ATTR,
-    ORTH,
-    POS,
-    TAG,
-    attr_id_t,
-)
+from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
-from ..vocab cimport Vocab

 from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
 from ..strings import get_string_id
-from ..util import registry
 from .levenshtein import levenshtein_compare

 DEF PADDING = 5
@@ -87,7 +74,7 @@ cdef class Matcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self.has_key(key)
+        return self.has_key(key)  # no-cython-lint: W601

     def add(self, key, patterns, *, on_match=None, greedy: str = None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID
@@ -143,8 +130,13 @@ cdef class Matcher:
         key = self._normalize_key(key)
         for pattern in patterns:
             try:
-                specs = _preprocess_pattern(pattern, self.vocab,
-                    self._extensions, self._extra_predicates, self._fuzzy_compare)
+                specs = _preprocess_pattern(
+                    pattern,
+                    self.vocab,
+                    self._extensions,
+                    self._extra_predicates,
+                    self._fuzzy_compare
+                )
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
                     for attr, _ in spec[1]:
@@ -168,7 +160,7 @@ cdef class Matcher:
         key (str): The ID of the match rule.
         """
         norm_key = self._normalize_key(key)
-        if not norm_key in self._patterns:
+        if norm_key not in self._patterns:
             raise ValueError(Errors.E175.format(key=key))
         self._patterns.pop(norm_key)
         self._callbacks.pop(norm_key)
@@ -268,8 +260,15 @@ cdef class Matcher:
         if self.patterns.empty():
             matches = []
         else:
-            matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                                    extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
+            matches = find_matches(
+                &self.patterns[0],
+                self.patterns.size(),
+                doclike,
+                length,
+                extensions=self._extensions,
+                predicates=self._extra_predicates,
+                with_alignments=with_alignments
+            )
         final_matches = []
         pairs_by_id = {}
         # For each key, either add all matches, or only the filtered,
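The `__call__` hunk above only rewraps the `find_matches` invocation; behavior is unchanged. For orientation, a minimal use of the public API that exercises this code path:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

doc = nlp("Hello world! hello world again.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)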
@@ -366,7 +365,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     cdef vector[MatchC] matches
     cdef vector[vector[MatchAlignmentC]] align_states
     cdef vector[vector[MatchAlignmentC]] align_matches
-    cdef PatternStateC state
     cdef int i, j, nr_extra_attr
     cdef Pool mem = Pool()
     output = []
@@ -388,14 +386,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
                 value = token.vocab.strings[value]
             extra_attr_values[i * nr_extra_attr + index] = value
     # Main loop
-    cdef int nr_predicate = len(predicates)
     for i in range(length):
         for j in range(n):
             states.push_back(PatternStateC(patterns[j], i, 0))
         if with_alignments != 0:
             align_states.resize(states.size())
-        transition_states(states, matches, align_states, align_matches, predicate_cache,
-            doclike[i], extra_attr_values, predicates, with_alignments)
+        transition_states(
+            states,
+            matches,
+            align_states,
+            align_matches,
+            predicate_cache,
+            doclike[i],
+            extra_attr_values,
+            predicates,
+            with_alignments
+        )
         extra_attr_values += nr_extra_attr
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
@@ -421,18 +427,28 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     return output


-cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
-                            vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
-    int8_t* cached_py_predicates,
-        Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
+cdef void transition_states(
+    vector[PatternStateC]& states,
+    vector[MatchC]& matches,
+    vector[vector[MatchAlignmentC]]& align_states,
+    vector[vector[MatchAlignmentC]]& align_matches,
+    int8_t* cached_py_predicates,
+    Token token,
+    const attr_t* extra_attrs,
+    py_predicates,
+    bint with_alignments
+) except *:
     cdef int q = 0
     cdef vector[PatternStateC] new_states
     cdef vector[vector[MatchAlignmentC]] align_new_states
-    cdef int nr_predicate = len(py_predicates)
     for i in range(states.size()):
         if states[i].pattern.nr_py >= 1:
-            update_predicate_cache(cached_py_predicates,
-                states[i].pattern, token, py_predicates)
+            update_predicate_cache(
+                cached_py_predicates,
+                states[i].pattern,
+                token,
+                py_predicates
+            )
         action = get_action(states[i], token.c, extra_attrs,
                             cached_py_predicates)
         if action == REJECT:
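`transition_states` advances a set of partial-match states over a single token, the classic NFA-style simulation. A toy Python rendering of the idea, deliberately detached from the C structs, quantifiers, and alignment bookkeeping above:

# Toy NFA step: a state is (pattern, position, start); a pattern is a list
# of token predicates. The real code also handles quantifiers, alignments
# and cached Python predicates.
def step(states, token, token_index, matches):
    survivors = []
    for pattern, pos, start in states:
        if pattern[pos](token):
            if pos + 1 == len(pattern):
                matches.append((start, token_index + 1))   # full match
            else:
                survivors.append((pattern, pos + 1, start))
    return survivors

pattern = [lambda t: t == "hello", lambda t: t == "world"]
matches, states = [], []
for i, tok in enumerate(["hello", "world"]):
    states.append((pattern, 0, i))
    states = step(states, tok, i, matches)
print(matches)  # [(0, 2)]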
@@ -468,8 +484,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                     align_new_states.push_back(align_states[q])
             states[q].pattern += 1
             if states[q].pattern.nr_py != 0:
-                update_predicate_cache(cached_py_predicates,
-                    states[q].pattern, token, py_predicates)
+                update_predicate_cache(
+                    cached_py_predicates,
+                    states[q].pattern,
+                    token,
+                    py_predicates
+                )
             action = get_action(states[q], token.c, extra_attrs,
                                 cached_py_predicates)
         # Update alignment before the transition of current state
@@ -485,8 +505,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
             ent_id = get_ent_id(state.pattern)
             if action == MATCH:
                 matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                            length=state.length+1))
+                    MatchC(
+                        pattern_id=ent_id,
+                        start=state.start,
+                        length=state.length+1
+                    )
+                )
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
@@ -494,23 +518,35 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                 # push match without last token if length > 0
                 if state.length > 0:
                     matches.push_back(
-                        MatchC(pattern_id=ent_id, start=state.start,
-                                length=state.length))
+                        MatchC(
+                            pattern_id=ent_id,
+                            start=state.start,
+                            length=state.length
+                        )
+                    )
                     # MATCH_DOUBLE emits matches twice,
                     # add one more to align_matches in order to keep 1:1 relationship
                     if with_alignments != 0:
                         align_matches.push_back(align_states[q])
                 # push match with last token
                 matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                            length=state.length+1))
+                    MatchC(
+                        pattern_id=ent_id,
+                        start=state.start,
+                        length=state.length + 1
+                    )
+                )
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
             elif action == MATCH_REJECT:
                 matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                            length=state.length))
+                    MatchC(
+                        pattern_id=ent_id,
+                        start=state.start,
+                        length=state.length
+                    )
+                )
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
@@ -533,8 +569,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
             align_states.push_back(align_new_states[i])


-cdef int update_predicate_cache(int8_t* cache,
-        const TokenPatternC* pattern, Token token, predicates) except -1:
+cdef int update_predicate_cache(
+    int8_t* cache,
+    const TokenPatternC* pattern,
+    Token token,
+    predicates
+) except -1:
     # If the state references any extra predicates, check whether they match.
     # These are cached, so that we don't call these potentially expensive
     # Python functions more than we need to.
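The comments above state the purpose of `update_predicate_cache`: each Python predicate is evaluated at most once per token, with results memoized in an int8 array. A sketch of the same memoization in plain Python; the 0/-1/1 encoding is an assumption mirroring the int8_t cache, not the exact values used in the C code:

UNSET, NO_MATCH, MATCH = 0, -1, 1   # assumed int8-style encoding

def cached_predicate(cache, idx, predicate, token):
    if cache[idx] == UNSET:
        cache[idx] = MATCH if predicate(token) else NO_MATCH
    return cache[idx] == MATCH

cache = [UNSET] * 4
is_long = lambda tok: len(tok) > 5
cached_predicate(cache, 0, is_long, "transition")  # computed once
cached_predicate(cache, 0, is_long, "transition")  # served from cache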
@@ -580,10 +620,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
             else:
                 state.pattern += 1

-
-cdef action_t get_action(PatternStateC state,
-        const TokenC* token, const attr_t* extra_attrs,
-        const int8_t* predicate_matches) nogil:
+cdef action_t get_action(
+    PatternStateC state,
+    const TokenC * token,
+    const attr_t * extra_attrs,
+    const int8_t * predicate_matches
+) nogil:
     """We need to consider:
     a) Does the token match the specification? [Yes, No]
     b) What's the quantifier? [1, 0+, ?]
@@ -693,9 +735,12 @@ cdef action_t get_action(PatternStateC state,
             return RETRY


-cdef int8_t get_is_match(PatternStateC state,
-        const TokenC* token, const attr_t* extra_attrs,
-        const int8_t* predicate_matches) nogil:
+cdef int8_t get_is_match(
+    PatternStateC state,
+    const TokenC* token,
+    const attr_t* extra_attrs,
+    const int8_t* predicate_matches
+) nogil:
     for i in range(state.pattern.nr_py):
         if predicate_matches[state.pattern.py_predicates[i]] == -1:
             return 0
@@ -1101,8 +1146,9 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
     return output


-def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
-        seen_predicates):
+def _get_extension_extra_predicates(
+    spec, extra_predicates, predicate_types, seen_predicates
+):
     output = []
     for attr, value in spec.items():
         if isinstance(value, dict):

@@ -1,14 +1,12 @@
 # cython: infer_types=True, profile=True
-from libc.stdint cimport uintptr_t
 from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set

 import warnings

-from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
+from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG

 from ..attrs import IDS

-from ..structs cimport TokenC
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t

@@ -40,11 +40,16 @@ cdef ActivationsC alloc_activations(SizesC n) nogil

 cdef void free_activations(const ActivationsC* A) nogil

-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil
+cdef void predict_states(
+    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
+) nogil

 cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil

-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores, int O) nogil
-
+cdef void cpu_log_loss(
+    float* d_scores,
+    const float* costs,
+    const int* is_valid,
+    const float* scores,
+    int O
+) nogil

@@ -8,13 +8,13 @@ from thinc.backends.linalg cimport Vec, VecVec

 import numpy
 import numpy.random
-from thinc.api import CupyOps, Model, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps

 from .. import util
 from ..errors import Errors

 from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport class_t, hash_t, weight_t
+from ..typedefs cimport weight_t


 cdef WeightsC get_c_weights(model) except *:
@@ -78,33 +78,48 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
         A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
         A._max_size = n.states
     else:
-        A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
-        A.scores = <float*>realloc(A.scores,
-            n.states * n.classes * sizeof(A.scores[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+        A.token_ids = <int*>realloc(
+            A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
+        )
+        A.scores = <float*>realloc(
+            A.scores, n.states * n.classes * sizeof(A.scores[0])
+        )
+        A.unmaxed = <float*>realloc(
+            A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
+        )
+        A.hiddens = <float*>realloc(
+            A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
+        )
+        A.is_valid = <int*>realloc(
+            A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
+        )
         A._max_size = n.states
     A._curr_size = n.states


-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil:
-    cdef double one = 1.0
+cdef void predict_states(
+    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
+) nogil:
     resize_activations(A, n)
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
     memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
     memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(cblas, A.unmaxed,
-        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    sum_state_features(
+        cblas,
+        A.unmaxed,
+        W.feat_weights,
+        A.token_ids,
+        n.states,
+        n.feats,
+        n.hiddens * n.pieces
+    )
     for i in range(n.states):
-        VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
-            W.feat_bias, 1., n.hiddens * n.pieces)
+        VecVec.add_i(
+            &A.unmaxed[i*n.hiddens*n.pieces],
+            W.feat_bias, 1.,
+            n.hiddens * n.pieces
+        )
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
             which = Vec.arg_max(&A.unmaxed[index], n.pieces)
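`predict_states` sums feature weights into `A.unmaxed`, adds the feature bias, then takes a maxout over `n.pieces` candidate activations per hidden unit (`Vec.arg_max` picks the winning piece). The same computation in numpy, as a reference sketch with assumed toy shapes:

import numpy as np

states, hiddens, pieces = 2, 4, 3
unmaxed = np.random.rand(states, hiddens, pieces).astype("float32")
feat_bias = np.random.rand(hiddens * pieces).astype("float32")

unmaxed += feat_bias.reshape(hiddens, pieces)  # VecVec.add_i equivalent
hidden = unmaxed.max(axis=-1)                  # maxout: best piece per unit
assert hidden.shape == (states, hiddens)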
@@ -114,14 +129,15 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
         memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
     else:
         # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+        sgemm(cblas)(
+            False, True, n.states, n.classes, n.hiddens,
             1.0, <const float *>A.hiddens, n.hiddens,
             <const float *>W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes)
+            0.0, A.scores, n.classes
+        )
         # Add bias
         for i in range(n.states):
-            VecVec.add_i(&A.scores[i*n.classes],
-                W.hidden_bias, 1., n.classes)
+            VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
     # Set unseen classes to minimum value
     i = 0
     min_ = A.scores[0]
@@ -134,9 +150,16 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
                 A.scores[i*n.classes+j] = min_


-cdef void sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f, i
+cdef void sum_state_features(
+    CBlas cblas,
+    float* output,
+    const float* cached,
+    const int* token_ids,
+    int B,
+    int F,
+    int O
+) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     padding = cached
     cached += F * O
@@ -153,9 +176,13 @@ cdef void sum_state_features(CBlas cblas, float* output,
         token_ids += F


-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores,
-        int O) nogil:
+cdef void cpu_log_loss(
+    float* d_scores,
+    const float* costs,
+    const int* is_valid,
+    const float* scores,
+    int O
+) nogil:
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
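`cpu_log_loss` computes, per its docstring, a multi-label log-loss gradient: the softmax over all classes minus a softmax restricted to the zero-cost (gold) classes. A numpy rendering of that gradient; the masking convention for `costs` and `is_valid` is inferred from the surrounding code:

import numpy as np

def log_loss_grad(scores, costs, is_valid):
    exp = np.exp(scores - scores.max())
    gold = np.where((costs <= 0) & (is_valid == 1), exp, 0.0)
    return exp / exp.sum() - gold / gold.sum()

g = log_loss_grad(np.array([1.0, 2.0, 0.5]),
                  np.array([1.0, 0.0, 1.0]),
                  np.array([1, 1, 1]))
assert abs(g.sum()) < 1e-6   # the two softmaxes both sum to 1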
@@ -179,8 +206,9 @@ cdef void cpu_log_loss(float* d_scores,
             d_scores[i] = exp(scores[i]-max_) / Z


-cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-        const int* is_valid, int n) nogil:
+cdef int arg_max_if_gold(
+    const weight_t* scores, const weight_t* costs, const int* is_valid, int n
+) nogil:
     # Find minimum cost
     cdef float cost = 1
     for i in range(n):
@@ -204,10 +232,17 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
     return best


-
 class ParserStepModel(Model):
-    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-            dropout=0.1):
+    def __init__(
+        self,
+        docs,
+        layers,
+        *,
+        has_upper,
+        unseen_classes=None,
+        train=True,
+        dropout=0.1
+    ):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
         self.attrs["has_upper"] = has_upper
         self.attrs["dropout_rate"] = dropout
@@ -268,8 +303,10 @@ class ParserStepModel(Model):
         return ids

     def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
-        if isinstance(self.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+        if (
+            isinstance(self.state2vec.ops, CupyOps)
+            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
+        ):
             # Move token_ids and d_vector to GPU, asynchronously
             self.backprops.append((
                 util.get_async(self.cuda_stream, token_ids),
@@ -279,7 +316,6 @@ class ParserStepModel(Model):
         else:
             self.backprops.append((token_ids, d_vector, get_d_tokvecs))

-
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@@ -292,14 +328,15 @@ class ParserStepModel(Model):
             ids = ids.flatten()
             d_state_features = d_state_features.reshape(
                 (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids,
-                d_state_features)
+            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
         # Padded -- see update()
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs


 NUMPY_OPS = NumpyOps()


 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
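`finish_steps` relies on a padding trick: `d_tokvecs` gets one extra row, so state positions that reference a missing token (id -1) scatter their gradient into a throwaway slot, which is dropped before calling `bp_tokvecs`. A numpy sketch of the trick, with `np.add.at` standing in for `ops.scatter_add`:

import numpy as np

width, n_tokens = 3, 4
d_tokvecs = np.zeros((n_tokens + 1, width))   # last row = padding slot

ids = np.array([0, 2, -1, 3])                 # -1 marks a missing token
d_state_features = np.ones((4, width))

np.add.at(d_tokvecs, ids, d_state_features)   # -1 lands in the last row
d_tokvecs = d_tokvecs[:-1]                    # drop padding before backprop
assert d_tokvecs.shape == (n_tokens, width)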
@@ -312,7 +349,7 @@ def step_forward(model: ParserStepModel, states, is_train):
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores
+        get_d_vector = lambda d_scores: d_scores  # no-cython-lint: E731
     # If the class is unseen, make sure its score is minimum
     scores[:, model._class_mask == 0] = numpy.nanmin(scores)

@@ -448,9 +485,11 @@ cdef class precompute_hiddens:

         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
-        sum_state_features(cblas, <float*>state_vector.data,
+        sum_state_features(
+            cblas, <float*>state_vector.data,
             feat_weights, &ids[0, 0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+            token_ids.shape[0], self.nF, self.nO*self.nP
+        )
         state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

@@ -20,4 +20,8 @@ cdef class Morphology:
 cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
 cdef list list_features(const MorphAnalysisC* morph)
 cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
-cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
+cdef int get_n_by_field(
+    attr_t* results,
+    const MorphAnalysisC* morph,
+    attr_t field,
+) nogil

@@ -83,10 +83,11 @@ cdef class Morphology:
         features = self.normalize_attrs(features)
         string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
         # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(sorted([
-                self.FIELD_SEP.join([field, values])
-            for field, values in string_features.items()
-        ]))
+        norm_feats_string = self.FEATURE_SEP.join(
+            sorted(
+                [self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
+            )
+        )
         return norm_feats_string or self.EMPTY_MORPH

     def normalize_attrs(self, attrs):
| 
						 | 
					@ -192,6 +193,7 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie
 | 
				
			||||||
            n_results += 1
 | 
					            n_results += 1
 | 
				
			||||||
    return n_results
 | 
					    return n_results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def unpickle_morphology(strings, tags):
 | 
					def unpickle_morphology(strings, tags):
 | 
				
			||||||
    cdef Morphology morphology = Morphology(strings)
 | 
					    cdef Morphology morphology = Morphology(strings)
 | 
				
			||||||
    for tag in tags:
 | 
					    for tag in tags:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
@@ -46,11 +46,18 @@ cdef struct EditTreeC:
     bint is_match_node
     NodeC inner
 
-cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
-        uint32_t prefix_tree, uint32_t suffix_tree):
-    cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
-            suffix_len=suffix_len, prefix_tree=prefix_tree,
-            suffix_tree=suffix_tree)
+cdef inline EditTreeC edittree_new_match(
+    len_t prefix_len,
+    len_t suffix_len,
+    uint32_t prefix_tree,
+    uint32_t suffix_tree
+):
+    cdef MatchNodeC match_node = MatchNodeC(
+        prefix_len=prefix_len,
+        suffix_len=suffix_len,
+        prefix_tree=prefix_tree,
+        suffix_tree=suffix_tree
+    )
     cdef NodeC inner = NodeC(match_node=match_node)
     return EditTreeC(is_match_node=True, inner=inner)
 
@@ -5,8 +5,6 @@ from libc.string cimport memset
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
 
-from pathlib import Path
-
 from ...typedefs cimport hash_t
 
 from ... import util
@@ -25,17 +23,16 @@ cdef LCS find_lcs(str source, str target):
     target (str): The second string.
     RETURNS (LCS): The spans of the longest common subsequences.
     """
-    cdef Py_ssize_t source_len = len(source)
     cdef Py_ssize_t target_len = len(target)
-    cdef size_t longest_align = 0;
+    cdef size_t longest_align = 0
     cdef int source_idx, target_idx
     cdef LCS lcs
     cdef Py_UCS4 source_cp, target_cp
 
     memset(&lcs, 0, sizeof(lcs))
 
-    cdef vector[size_t] prev_aligns = vector[size_t](target_len);
-    cdef vector[size_t] cur_aligns = vector[size_t](target_len);
+    cdef vector[size_t] prev_aligns = vector[size_t](target_len)
+    cdef vector[size_t] cur_aligns = vector[size_t](target_len)
 
     for (source_idx, source_cp) in enumerate(source):
         for (target_idx, target_cp) in enumerate(target):
@@ -89,7 +86,7 @@ cdef class EditTrees:
         cdef LCS lcs = find_lcs(form, lemma)
 
         cdef EditTreeC tree
-        cdef uint32_t tree_id, prefix_tree, suffix_tree
+        cdef uint32_t prefix_tree, suffix_tree
         if lcs_is_empty(lcs):
             tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
         else:
@@ -289,6 +286,7 @@ def _tree2dict(tree):
         tree = tree["inner"]["subst_node"]
     return(dict(tree))
 
+
 def _dict2tree(tree):
     errors = validate_edit_tree(tree)
     if errors:
@@ -1,17 +1,14 @@
 # cython: infer_types=True
 # cython: profile=True
-cimport numpy as np
-
 import numpy
 
-from cpython.ref cimport Py_XDECREF, PyObject
 from thinc.extra.search cimport Beam
 
 from thinc.extra.search import MaxViolation
 
 from thinc.extra.search cimport MaxViolation
 
-from ...typedefs cimport class_t, hash_t
+from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
@@ -146,7 +143,6 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_density=0.0):
     cdef MaxViolation violn
     pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
     gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
-    cdef StateClass state
     beam_maps = []
     backprops = []
     violns = [MaxViolation() for _ in range(len(states))]
@@ -277,7 +277,6 @@ cdef cppclass StateC:
 
         return n
 
-
     int n_L(int head) nogil const:
         return n_arcs(this._left_arcs, head)
 
@@ -9,7 +9,7 @@ from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...tokens.token cimport MISSING_DEP
-from ...typedefs cimport attr_t, hash_t
+from ...typedefs cimport attr_t
 
 from ...training import split_bilu_label
 
@@ -68,8 +68,9 @@ cdef struct GoldParseStateC:
     weight_t pop_cost
 
 
-cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
-        heads, labels, sent_starts) except *:
+cdef GoldParseStateC create_gold_state(
+    Pool mem, const StateC* state, heads, labels, sent_starts
+) except *:
     cdef GoldParseStateC gs
     gs.length = len(heads)
     gs.stride = 1
@@ -82,7 +83,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
     gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
 
     for i, is_sent_start in enumerate(sent_starts):
-        if is_sent_start == True:
+        if is_sent_start is True:
             gs.state_bits[i] = set_state_flag(
                 gs.state_bits[i],
                 IS_SENT_START,
@@ -210,6 +211,7 @@ cdef class ArcEagerGold:
     def update(self, StateClass stcls):
         update_gold_state(&self.c, stcls.c)
 
+
 def _get_aligned_sent_starts(example):
     """Get list of SENT_START attributes aligned to the predicted tokenization.
     If the reference has not sentence starts, return a list of None values.
@@ -524,7 +526,6 @@ cdef class Break:
     """
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        cdef int i
        if st.buffer_length() < 2:
             return False
         elif st.B(1) != st.B(0) + 1:
@@ -556,8 +557,8 @@ cdef class Break:
                 cost -= 1
             if gold.heads[si] == b0:
                 cost -= 1
-        if not is_sent_start(gold, state.B(1)) \
-        and not is_sent_start_unknown(gold, state.B(1)):
+        if not is_sent_start(gold, state.B(1)) and\
+                not is_sent_start_unknown(gold, state.B(1)):
             cost += 1
         return cost
 
@@ -803,7 +804,6 @@ cdef class ArcEager(TransitionSystem):
             raise TypeError(Errors.E909.format(name="ArcEagerGold"))
         cdef ArcEagerGold gold_ = gold
         gold_state = gold_.c
-        n_gold = 0
         if self.c[i].is_valid(stcls.c, self.c[i].label):
             cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
         else:
@@ -875,7 +875,7 @@ cdef class ArcEager(TransitionSystem):
             print("Gold")
             for token in example.y:
                 print(token.i, token.text, token.dep_, token.head.text)
-            aligned_heads, aligned_labels = example.get_aligned_parse()
+            aligned_heads, _aligned_labels = example.get_aligned_parse()
             print("Aligned heads")
             for i, head in enumerate(aligned_heads):
                 print(example.x[i], example.x[head] if head is not None else "__")
@@ -1,6 +1,3 @@
-import os
-import random
-
 from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t
 
@@ -14,7 +11,7 @@ from ...tokens.span import Span
 
 from ...attrs cimport IS_SPACE
 from ...lexeme cimport Lexeme
-from ...structs cimport SpanC, TokenC
+from ...structs cimport SpanC
 from ...tokens.span cimport Span
 from ...typedefs cimport attr_t, weight_t
 
@@ -145,7 +142,6 @@ cdef class BiluoPushDown(TransitionSystem):
         for entity_type in kwargs.get('entity_types', []):
             for action in (BEGIN, IN, LAST, UNIT):
                 actions[action][entity_type] = 1
-        moves = ('M', 'B', 'I', 'L', 'U')
         for example in kwargs.get('examples', []):
             for token in example.y:
                 ent_type = token.ent_type_
@@ -325,7 +321,6 @@ cdef class BiluoPushDown(TransitionSystem):
             raise TypeError(Errors.E909.format(name="BiluoGold"))
         cdef BiluoGold gold_ = gold
         gold_state = gold_.c
-        n_gold = 0
         if self.c[i].is_valid(stcls.c, self.c[i].label):
             cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
         else:
@@ -486,10 +481,8 @@ cdef class In:
     @staticmethod
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
-        move = IN
         cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
         cdef int g_act = gold.ner[s.B(0)].move
-        cdef attr_t g_tag = gold.ner[s.B(0)].label
         cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
 
         if g_act == MISSING:
@@ -549,12 +542,10 @@ cdef class Last:
     @staticmethod
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
-        move = LAST
         b0 = s.B(0)
         ent_start = s.E(0)
 
         cdef int g_act = gold.ner[b0].move
-        cdef attr_t g_tag = gold.ner[b0].label
 
         cdef int cost = 0
 
@@ -652,7 +643,6 @@ cdef class Unit:
         return cost
 
 
-
 cdef class Out:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
@@ -675,7 +665,6 @@ cdef class Out:
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
         cdef int g_act = gold.ner[s.B(0)].move
-        cdef attr_t g_tag = gold.ner[s.B(0)].label
         cdef weight_t cost = 0
         if g_act == MISSING:
             pass
@@ -125,14 +125,17 @@ def decompose(label):
 def is_decorated(label):
     return DELIMITER in label
 
+
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
         proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
                                              example.get_aligned("DEP"))
         # set the label to ROOT for each root dependent
-        deco_deps = ['ROOT' if head == i else deco_deps[i]
-                       for i, head in enumerate(proj_heads)]
+        deco_deps = [
+            'ROOT' if head == i else deco_deps[i]
+            for i, head in enumerate(proj_heads)
+        ]
         # count label frequencies
         for label in deco_deps:
             if is_decorated(label):
@@ -160,9 +163,9 @@ def projectivize(heads, labels):
 
 
 cdef vector[int] _heads_to_c(heads):
-    cdef vector[int] c_heads;
+    cdef vector[int] c_heads
     for head in heads:
-        if head == None:
+        if head is None:
             c_heads.push_back(-1)
         else:
             assert head < len(heads)
@@ -199,6 +202,7 @@ def _decorate(heads, proj_heads, labels):
             deco_labels.append(labels[tokenid])
     return deco_labels
 
+
 def get_smallest_nonproj_arc_slow(heads):
     cdef vector[int] c_heads = _heads_to_c(heads)
     return _get_smallest_nonproj_arc(c_heads)
@@ -1,6 +1,4 @@
 # cython: infer_types=True
-import numpy
-
 from libcpp.vector cimport vector
 
 from ...tokens.doc cimport Doc
@@ -20,11 +20,15 @@ cdef struct Transition:
     int (*do)(StateC* state, attr_t label) nogil
 
 
-ctypedef weight_t (*get_cost_func_t)(const StateC* state, const void* gold,
-        attr_t label) nogil
-ctypedef weight_t (*move_cost_func_t)(const StateC* state, const void* gold) nogil
-ctypedef weight_t (*label_cost_func_t)(const StateC* state, const void*
-        gold, attr_t label) nogil
+ctypedef weight_t (*get_cost_func_t)(
+    const StateC* state, const void* gold, attr_t label
+) nogil
+ctypedef weight_t (*move_cost_func_t)(
+    const StateC* state, const void* gold
+) nogil
+ctypedef weight_t (*label_cost_func_t)(
+    const StateC* state, const void* gold, attr_t label
+) nogil
 
 ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
 
@@ -8,9 +8,7 @@ from collections import Counter
 import srsly
 
 from ...structs cimport TokenC
-from ...tokens.doc cimport Doc
 from ...typedefs cimport attr_t, weight_t
-from . cimport _beam_utils
 from .stateclass cimport StateClass
 
 from ... import util
@@ -231,7 +229,6 @@ cdef class TransitionSystem:
         return self
 
     def to_bytes(self, exclude=tuple()):
-        transitions = []
         serializers = {
             'moves': lambda: srsly.json_dumps(self.labels),
             'strings': lambda: self.strings.to_bytes(),
@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from collections import defaultdict
-from typing import Callable, Iterable, Optional
+from typing import Callable, Optional
 
 from thinc.api import Config, Model
 
@@ -124,6 +124,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -2,7 +2,6 @@
 from itertools import islice
 from typing import Callable, Dict, Optional, Union
 
-import srsly
 from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..morphology cimport Morphology
@@ -14,10 +13,8 @@ from ..errors import Errors
 from ..language import Language
 from ..parts_of_speech import IDS as POS_IDS
 from ..scorer import Scorer
-from ..symbols import POS
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .pipe import deserialize_config
 from .tagger import Tagger
 
 # See #9050
@@ -76,8 +73,11 @@ def morphologizer_score(examples, **kwargs):
     results = {}
     results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
     results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
-    results.update(Scorer.score_token_attr_per_feat(examples,
-        "morph", getter=morph_key_getter, **kwargs))
+    results.update(
+        Scorer.score_token_attr_per_feat(
+            examples, "morph", getter=morph_key_getter, **kwargs
+        )
+    )
     return results
 
 
@@ -233,7 +233,6 @@ class Morphologizer(Tagger):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
-        cdef Vocab vocab = self.vocab
         cdef bint overwrite = self.cfg["overwrite"]
         cdef bint extend = self.cfg["extend"]
         labels = self.labels
@@ -4,13 +4,10 @@ from typing import Optional
 import numpy
 from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
 
-from ..tokens.doc cimport Doc
-
-from ..attrs import ID, POS
+from ..attrs import ID
 from ..errors import Errors
 from ..language import Language
 from ..training import validate_examples
-from ._parser_internals import nonproj
 from .tagger import Tagger
 from .trainable_pipe import TrainablePipe
 
@@ -103,10 +100,9 @@ class MultitaskObjective(Tagger):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
-        docs = [eg.predicted for eg in examples]
         for i, eg in enumerate(examples):
             # Handles alignment for tokenization differences
-            doc_annots = eg.get_aligned()  # TODO
+            _doc_annots = eg.get_aligned()  # TODO
             for j in range(len(eg.predicted)):
                 tok_annots = {key: values[j] for key, values in tok_annots.items()}
                 label = self.make_label(j, tok_annots)
@@ -206,7 +202,6 @@ class ClozeMultitask(TrainablePipe):
             losses[self.name] = 0.
         set_dropout_rate(self.model, drop)
         validate_examples(examples, "ClozeMultitask.rehearse")
-        docs = [eg.predicted for eg in examples]
         predictions, bp_predictions = self.model.begin_update()
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from collections import defaultdict
-from typing import Callable, Iterable, Optional
+from typing import Callable, Optional
 
 from thinc.api import Config, Model
 
@@ -10,7 +10,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
 
 from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
 from ..training import remove_bilu_prefix
 from ..util import registry
 
@@ -100,6 +100,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 import warnings
-from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
+from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
 
 import srsly
 
@@ -7,13 +7,13 @@ from ..tokens.doc cimport Doc
 
 from .. import util
 from ..language import Language
-from ..scorer import Scorer
 from .pipe import Pipe
 from .senter import senter_score
 
 # see #9050
 BACKWARD_OVERWRITE = False
 
+
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
@@ -36,7 +36,8 @@ class Sentencizer(Pipe):
     DOCS: https://spacy.io/api/sentencizer
     """
 
-    default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
+    default_punct_chars = [
+        '!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
         '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
         '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
         '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
@@ -46,7 +47,8 @@ class Sentencizer(Pipe):
         '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
         '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
         '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
-            '。', '。']
+        '。', '。'
+    ]
 
     def __init__(
         self,
@@ -128,7 +130,6 @@ class Sentencizer(Pipe):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
-        cdef int idx = 0
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
             for j, tag_id in enumerate(doc_tag_ids):
@@ -169,7 +170,6 @@ class Sentencizer(Pipe):
         path = path.with_suffix(".json")
         srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
 
-
     def from_disk(self, path, *, exclude=tuple()):
         """Load the sentencizer from disk.
 
@@ -2,7 +2,6 @@
 from itertools import islice
 from typing import Callable, Optional
 
-import srsly
 from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..tokens.doc cimport Doc
@@ -1,26 +1,18 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
 from itertools import islice
 from typing import Callable, Optional
 
 import numpy
-import srsly
 from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
-from thinc.types import Floats2d
 
-from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
-from ..vocab cimport Vocab
 
 from .. import util
-from ..attrs import ID, POS
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
-from ..parts_of_speech import X
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
 # See #9050
@@ -169,7 +161,6 @@ class Tagger(TrainablePipe):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
-        cdef Vocab vocab = self.vocab
         cdef bint overwrite = self.cfg["overwrite"]
         labels = self.labels
         for i, doc in enumerate(docs):
@@ -13,8 +13,18 @@ cdef class Parser(TrainablePipe):
     cdef readonly TransitionSystem moves
     cdef public object _multitasks
 
-    cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil
+    cdef void _parseC(
+        self,
+        CBlas cblas,
+        StateC** states,
+        WeightsC weights,
+        SizesC sizes
+    ) nogil
 
-    cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil
+    cdef void c_transition_batch(
+        self,
+        StateC** states,
+        const float* scores,
+        int nr_class,
+        int batch_size
+    ) nogil
@@ -7,20 +7,15 @@ from cymem.cymem cimport Pool
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
 
-import srsly
-from thinc.api import CupyOps, NumpyOps, get_ops, set_dropout_rate
-
-from thinc.extra.search cimport Beam
-
-import warnings
-
 import numpy
 import numpy.random
+import srsly
+from thinc.api import CupyOps, NumpyOps, set_dropout_rate
 
 from ..ml.parser_model cimport (
     ActivationsC,
@@ -42,7 +37,7 @@ from .trainable_pipe import TrainablePipe
 from ._parser_internals cimport _beam_utils
 
 from .. import util
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..training import validate_examples, validate_get_examples
 from ._parser_internals import _beam_utils
 
@@ -258,7 +253,6 @@ cdef class Parser(TrainablePipe):
             except Exception as e:
                 error_handler(self.name, self, batch_in_order, e)
 
-
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
@@ -300,8 +294,6 @@ cdef class Parser(TrainablePipe):
         return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
-        cdef Beam beam
-        cdef Doc doc
         self._ensure_labels_are_added(docs)
         batch = _beam_utils.BeamBatch(
             self.moves,
@@ -321,16 +313,18 @@ cdef class Parser(TrainablePipe):
         del model
         return list(batch)
 
-    cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil:
-        cdef int i, j
+    cdef void _parseC(
+        self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
+    ) nogil:
+        cdef int i
         cdef vector[StateC*] unfinished
         cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(cblas, &activations, states, &weights, sizes)
             # Validate actions, argmax, take action.
-            self.c_transition_batch(states,
-                activations.scores, sizes.classes, sizes.states)
+            self.c_transition_batch(
+                states, activations.scores, sizes.classes, sizes.states
+            )
             for i in range(sizes.states):
                 if not states[i].is_final():
                     unfinished.push_back(states[i])
@@ -342,7 +336,6 @@ cdef class Parser(TrainablePipe):
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
-        cdef Beam beam
         cdef Doc doc
         states = _beam_utils.collect_states(states_or_beams, docs)
         for i, (state, doc) in enumerate(zip(states, docs)):
@@ -359,8 +352,13 @@ cdef class Parser(TrainablePipe):
         self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
         return [state for state in states if not state.c.is_final()]
 
-    cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil:
+    cdef void c_transition_batch(
+        self,
+        StateC** states,
+        const float* scores,
+        int nr_class,
+        int batch_size
+    ) nogil:
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         with gil:
             assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -380,7 +378,6 @@ cdef class Parser(TrainablePipe):
         free(is_valid)
 
     def update(self, examples, *, drop=0., sgd=None, losses=None):
-        cdef StateClass state
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.)
@@ -420,7 +417,6 @@ cdef class Parser(TrainablePipe):
             return losses
         model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
 
-        all_states = list(states)
         states_golds = list(zip(states, golds))
         n_moves = 0
         while states_golds:
@@ -500,8 +496,16 @@ cdef class Parser(TrainablePipe):
         del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(
+        self,
+        examples,
+        *,
+        beam_width,
+        drop=0.,
+        sgd=None,
+        losses=None,
+        beam_density=0.0
+    ):
         states, golds, _ = self.moves.init_gold_batch(examples)
         if not states:
             return losses
@@ -531,8 +535,9 @@ cdef class Parser(TrainablePipe):
 
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                        dtype='f', order='C')
+        cdef np.ndarray d_scores = numpy.zeros(
+            (len(states), self.moves.n_moves), dtype='f', order='C'
+        )
         c_d_scores = <float*>d_scores.data
         unseen_classes = self.model.attrs["unseen_classes"]
         for i, (state, gold) in enumerate(zip(states, golds)):
@@ -542,8 +547,9 @@ cdef class Parser(TrainablePipe):
             for j in range(self.moves.n_moves):
                 if costs[j] <= 0.0 and j in unseen_classes:
                     unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            cpu_log_loss(
+                c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
+            )
             c_d_scores += d_scores.shape[1]
         # Note that we don't normalize this. See comment in update() for why.
 | 
					        # Note that we don't normalize this. See comment in update() for why.
 | 
				
			||||||
        if losses is not None:
 | 
					        if losses is not None:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
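The reflowed update_beam signature above keeps every option after the bare * keyword-only, so callers must pass them by name. A minimal plain-Python sketch of that calling convention (stand-in body, not the parser's logic):

    def update_beam(examples, *, beam_width, drop=0.0, sgd=None, losses=None, beam_density=0.0):
        # stand-in body; the real method steps the beams and accumulates losses
        return losses

    update_beam([], beam_width=4)    # OK: beam_width is passed by name
    # update_beam([], 4)             # TypeError: beam_width is keyword-only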
@@ -2,7 +2,6 @@
 cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash32, hash64
 
 import srsly
@@ -20,9 +19,10 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
     try:
         out_hash[0] = key
         return True
-    except:
+    except:  # no-cython-lint
        return False
 
+
 def get_string_id(key):
     """Get a string ID, handling the reserved symbols correctly. If the key is
     already an ID, return it.
@@ -87,7 +87,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
     cdef int n_length_bytes
     cdef int i
     cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
-    cdef uint32_t ulength = length
     if length < sizeof(string.s):
         string.s[0] = <unsigned char>length
         memcpy(&string.s[1], chars, length)
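The # no-cython-lint comment added to the bare except: above is cython-lint's line-level suppression marker. A sketch of the two forms used in this commit (the rule-specific variant appears later, in the O = 2 hunk):

    total = 0  # no-cython-lint        (skip all checks on this line)
    O = 2      # no-cython-lint: E741  (skip only rule E741, ambiguous name)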
@@ -52,7 +52,8 @@ TEST_PATTERNS = [
 
 
 @pytest.mark.parametrize(
-    "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]]
+    "pattern",
+    [[{"XX": "y"}], [{"LENGTH": "2"}], [{"TEXT": {"IN": 5}}], [{"text": {"in": 6}}]],
 )
 def test_matcher_pattern_validation(en_vocab, pattern):
     matcher = Matcher(en_vocab, validate=True)
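Splitting the parametrization into four single-pattern cases means each invalid pattern now trips validation on its own instead of hiding inside one token sequence. A hedged usage sketch (the error class is named per current spaCy releases; treat it as an assumption):

    from spacy.matcher import Matcher
    from spacy.vocab import Vocab

    matcher = Matcher(Vocab(), validate=True)
    try:
        # "XX" is not a recognized token attribute, so schema validation rejects it
        matcher.add("BAD_ATTR", [[{"XX": "y"}]])
    except Exception as err:
        print(type(err).__name__)  # expected: MatchPatternError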
@@ -12,6 +12,7 @@ def test_build_dependencies():
         "flake8",
         "hypothesis",
         "pre-commit",
+        "cython-lint",
         "black",
         "isort",
         "mypy",
@@ -377,3 +377,22 @@ def test_displacy_manual_sorted_entities():
 
     html = displacy.render(doc, style="ent", manual=True)
     assert html.find("FIRST") < html.find("SECOND")
+
+
+@pytest.mark.issue(12816)
+def test_issue12816(en_vocab) -> None:
+    """Test that displaCy's span visualizer escapes annotated HTML tags correctly."""
+    # Create a doc containing an annotated word and an unannotated HTML tag
+    doc = Doc(en_vocab, words=["test", "<TEST>"])
+    doc.spans["sc"] = [Span(doc, 0, 1, label="test")]
+
+    # Verify that the HTML tag is escaped when unannotated
+    html = displacy.render(doc, style="span")
+    assert "&lt;TEST&gt;" in html
+
+    # Annotate the HTML tag
+    doc.spans["sc"].append(Span(doc, 1, 2, label="test"))
+
+    # Verify that the HTML tag is still escaped
+    html = displacy.render(doc, style="span")
+    assert "&lt;TEST&gt;" in html
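Condensed, the behavior this new regression test pins down is that markup-like token text must come out HTML-escaped from the span visualizer. A sketch assuming a blank Vocab() suffices for rendering:

    from spacy import displacy
    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["test", "<TEST>"])
    doc.spans["sc"] = [Span(doc, 0, 1, label="test")]
    html = displacy.render(doc, style="span")
    assert "&lt;TEST&gt;" in html  # the raw tag never reaches the markup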
@@ -31,24 +31,58 @@ cdef class Tokenizer:
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
-    cdef void _filter_special_spans(self, vector[SpanC] &original,
-                            vector[SpanC] &filtered, int doc_len) nogil
-    cdef object _prepare_special_spans(self, Doc doc,
-                                       vector[SpanC] &filtered)
-    cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
-                                       object span_data)
-    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+    cdef void _filter_special_spans(
+        self,
+        vector[SpanC] &original,
+        vector[SpanC] &filtered,
+        int doc_len,
+    ) nogil
+    cdef object _prepare_special_spans(
+        self,
+        Doc doc,
+        vector[SpanC] &filtered,
+    )
+    cdef int _retokenize_special_spans(
+        self,
+        Doc doc,
+        TokenC* tokens,
+        object span_data,
+    )
+    cdef int _try_specials_and_cache(
+        self,
+        hash_t key,
+        Doc tokens,
         int* has_special,
-                                     bint with_special_cases) except -1
-    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
-                       int* has_special, bint with_special_cases) except -1
-    cdef str _split_affixes(self, Pool mem, str string,
+        bint with_special_cases,
+    ) except -1
+    cdef int _tokenize(
+        self,
+        Doc tokens,
+        str span,
+        hash_t key,
+        int* has_special,
+        bint with_special_cases,
+    ) except -1
+    cdef str _split_affixes(
+        self,
+        Pool mem,
+        str string,
         vector[LexemeC*] *prefixes,
         vector[LexemeC*] *suffixes, int* has_special,
-                                bint with_special_cases)
-    cdef int _attach_tokens(self, Doc tokens, str string,
+        bint with_special_cases,
+    )
+    cdef int _attach_tokens(
+        self,
+        Doc tokens,
+        str string,
         vector[LexemeC*] *prefixes,
         vector[LexemeC*] *suffixes, int* has_special,
-                            bint with_special_cases) except -1
-    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
-                          int* has_special, int n) except -1
+        bint with_special_cases,
+    ) except -1
+    cdef int _save_cached(
+        self,
+        const TokenC* tokens,
+        hash_t key,
+        int* has_special,
+        int n,
+    ) except -1
@@ -8,20 +8,18 @@ from libcpp.set cimport set as stdset
 from preshed.maps cimport PreshMap
 
 import re
-import warnings
-
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
 from . import util
 from .attrs import intify_attrs
-from .errors import Errors, Warnings
+from .errors import Errors
 from .scorer import Scorer
 from .symbols import NORM, ORTH
 from .tokens import Span
 from .training import validate_examples
-from .util import get_words_and_spaces, registry
+from .util import get_words_and_spaces
 
 
 cdef class Tokenizer:
@@ -324,7 +322,7 @@ cdef class Tokenizer:
         cdef int span_start
         cdef int span_end
         while i < doc.length:
-            if not i in span_data:
+            if i not in span_data:
                 tokens[i + offset] = doc.c[i]
                 i += 1
             else:
@@ -395,12 +393,15 @@ cdef class Tokenizer:
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
-    cdef str _split_affixes(self, Pool mem, str string,
+    cdef str _split_affixes(
+        self,
+        Pool mem,
+        str string,
         vector[const LexemeC*] *prefixes,
         vector[const LexemeC*] *suffixes,
         int* has_special,
-                                bint with_special_cases):
-        cdef size_t i
+        bint with_special_cases
+    ):
         cdef str prefix
         cdef str suffix
         cdef str minus_pre
@@ -445,10 +446,6 @@ cdef class Tokenizer:
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
                             bint with_special_cases) except -1:
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
-        cdef int split, end
-        cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
         cdef str span
         cdef int i
@@ -458,9 +455,11 @@ cdef class Tokenizer:
         if string:
             if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
-            elif (self.token_match and self.token_match(string)) or \
-                    (self.url_match and \
-                    self.url_match(string)):
+            elif (
+                (self.token_match and self.token_match(string)) or
+                (self.url_match and self.url_match(string))
+            ):
+
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
@@ -821,7 +820,7 @@ cdef class Tokenizer:
         self.infix_finditer = None
         self.token_match = None
         self.url_match = None
-        msg = util.from_bytes(bytes_data, deserializers, exclude)
+        util.from_bytes(bytes_data, deserializers, exclude)
         if "prefix_search" in data and isinstance(data["prefix_search"], str):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if "suffix_search" in data and isinstance(data["suffix_search"], str):
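One hunk above also swaps "not i in span_data" for the idiomatic "i not in span_data" (pycodestyle's E713). The two spellings are equivalent, as a quick check shows:

    span_data = {3: ("three", 0)}
    assert (not 2 in span_data) == (2 not in span_data)  # both True
    assert (not 3 in span_data) == (3 not in span_data)  # both False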
@@ -1,7 +1,6 @@
 # cython: infer_types=True, bounds_check=False, profile=True
 from cymem.cymem cimport Pool
-from libc.stdlib cimport free, malloc
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
 
 import numpy
 from thinc.api import get_array_module
@@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
 from ..structs cimport LexemeC, TokenC
 from ..vocab cimport Vocab
-from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
+from .doc cimport Doc, set_children_from_heads, token_by_start
 from .span cimport Span
 from .token cimport Token
 
@@ -147,7 +146,7 @@ def _merge(Doc doc, merges):
         syntactic root of the span.
     RETURNS (Token): The first newly merged token.
     """
-    cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
+    cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
     cdef Span span
     cdef const LexemeC* lex
     cdef TokenC* token
@@ -165,7 +164,6 @@ def _merge(Doc doc, merges):
     merges.sort(key=_get_start)
     for merge_index, (span, attributes) in enumerate(merges):
         start = span.start
-        end = span.end
         spans.append(span)
         # House the new merged token where it starts
         token = &doc.c[start]
@@ -203,8 +201,9 @@ def _merge(Doc doc, merges):
     # for the merged region. To do this, we create a boolean array indicating
     # whether the row is to be deleted, then use numpy.delete
    if doc.tensor is not None and doc.tensor.size != 0:
-        doc.tensor = _resize_tensor(doc.tensor,
-            [(m[0].start, m[0].end) for m in merges])
+        doc.tensor = _resize_tensor(
+            doc.tensor, [(m[0].start, m[0].end) for m in merges]
+        )
     # Memorize span roots and sets dependencies of the newly merged
     # tokens to the dependencies of their roots.
     span_roots = []
@@ -345,7 +344,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     if to_process_tensor:
         xp = get_array_module(doc.tensor)
         if xp is numpy:
-            doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
+            doc.tensor = xp.append(
+                doc.tensor,
+                xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
+                axis=0
+            )
         else:
             shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
             resized_array = xp.zeros(shape, dtype="float32")
@@ -367,7 +370,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         token.norm = 0  # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros
-            doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
+            doc.tensor[token_index + i:token_index + i + 1] = \
+                xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
         # Update the character offset of the subtokens
         if i != 0:
             token.idx = orig_token.idx + idx_offset
@@ -455,7 +459,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
 def set_token_attrs(Token py_token, attrs):
     cdef TokenC* token = py_token.c
     cdef const LexemeC* lex = token.lex
-    cdef Doc doc = py_token.doc
     # Assign attributes
     for attr_name, attr_value in attrs.items():
         if attr_name == "_":  # Set extension attributes
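The reflowed xp.append call grows the tensor by one zero-filled row per new subtoken. A runnable NumPy sketch of that resize (shapes chosen arbitrarily for illustration):

    import numpy as xp

    tensor = xp.ones((3, 4), dtype="float32")  # pretend: 3 tokens, 4-dim rows
    nb_subtokens = 2
    tensor = xp.append(
        tensor,
        xp.zeros((nb_subtokens, tensor.shape[1]), dtype="float32"),
        axis=0,
    )
    assert tensor.shape == (5, 4)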
@@ -61,7 +61,6 @@ cdef class Doc:
     cdef int length
     cdef int max_length
 
-
     cdef public object noun_chunks_iterator
 
     cdef object __weakref__
@@ -43,14 +43,13 @@ from ..attrs cimport (
     attr_id_t,
 )
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
-from ..typedefs cimport attr_t, flags_t
+from ..typedefs cimport attr_t
 from .token cimport Token
 
 from .. import parts_of_speech, schemas, util
 from ..attrs import IDS, intify_attr
-from ..compat import copy_reg, pickle
+from ..compat import copy_reg
 from ..errors import Errors, Warnings
-from ..morphology import Morphology
 from ..util import get_words_and_spaces
 from ._retokenize import Retokenizer
 from .underscore import Underscore, get_ext_args
@@ -784,7 +783,7 @@ cdef class Doc:
             # TODO:
             # 1. Test basic data-driven ORTH gazetteer
             # 2. Test more nuanced date and currency regex
-            cdef attr_t entity_type, kb_id, ent_id
+            cdef attr_t kb_id, ent_id
             cdef int ent_start, ent_end
             ent_spans = []
             for ent_info in ents:
@@ -987,7 +986,6 @@ cdef class Doc:
             >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
         """
         cdef int i, j
-        cdef attr_id_t feature
         cdef np.ndarray[attr_t, ndim=2] output
         # Handle scalar/list inputs of strings/ints for py_attr_ids
         # See also #3064
@@ -999,8 +997,10 @@ cdef class Doc:
             py_attr_ids = [py_attr_ids]
         # Allow strings, e.g. 'lemma' or 'LEMMA'
         try:
-            py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
-                       for id_ in py_attr_ids]
+            py_attr_ids = [
+                (IDS[id_.upper()] if hasattr(id_, "upper") else id_)
+                for id_ in py_attr_ids
+            ]
         except KeyError as msg:
             keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
             raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
@@ -1030,8 +1030,6 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#count_by
         """
         cdef int i
-        cdef attr_t attr
-        cdef size_t count
 
         if counts is None:
             counts = Counter()
@@ -1093,7 +1091,6 @@ cdef class Doc:
         cdef int i, col
         cdef int32_t abs_head_index
         cdef attr_id_t attr_id
-        cdef TokenC* tokens = self.c
         cdef int length = len(array)
         if length != len(self):
             raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
@@ -1508,7 +1505,6 @@ cdef class Doc:
             attributes are inherited from the syntactic root of the span.
         RETURNS (Token): The first newly merged token.
         """
-        cdef str tag, lemma, ent_type
         attr_len = len(attributes)
         span_len = len(spans)
         if not attr_len == span_len:
@@ -1624,7 +1620,6 @@ cdef class Doc:
                 for token in char_span[1:]:
                     token.is_sent_start = False
 
-
         for span_group in doc_json.get("spans", {}):
             spans = []
             for span in doc_json["spans"][span_group]:
@@ -1769,7 +1764,6 @@ cdef class Doc:
         output.fill(255)
         cdef int i, j, start_idx, end_idx
         cdef bytes byte_string
-        cdef unsigned char utf8_char
         for i, byte_string in enumerate(byte_strings):
            j = 0
             start_idx = 0
@@ -1822,8 +1816,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
 
 cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
     # note: end is exclusive
-    cdef TokenC* head
-    cdef TokenC* child
     cdef int i
     # Set number of left/right children to 0. We'll increment it in the loops.
     for i in range(start, end):
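The reflowed comprehension in to_array normalizes a mixed list of attribute names and numeric IDs. A plain-Python sketch with a stand-in IDS table (the real mapping lives in spacy.attrs; the values here are hypothetical):

    IDS = {"LEMMA": 73, "POS": 74}  # hypothetical IDs, for illustration only

    py_attr_ids = ["lemma", 74]
    py_attr_ids = [
        (IDS[id_.upper()] if hasattr(id_, "upper") else id_)
        for id_ in py_attr_ids
    ]
    assert py_attr_ids == [73, 74]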
@@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
 
 cimport cython
 from cython.operator cimport dereference
-from libc.stdint cimport int32_t, int64_t
+from libc.stdint cimport int32_t
 from libcpp.pair cimport pair
 from libcpp.unordered_map cimport unordered_map
 from libcpp.unordered_set cimport unordered_set
@@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
 import weakref
 
 from murmurhash.mrmr cimport hash64
-from preshed.maps cimport map_get_unless_missing
 
 from .. import Errors
 
@@ -372,7 +371,9 @@ cdef class Graph:
         >>> assert graph.has_node((0,))
         >>> assert graph.has_edge((0,), (1,3), label="agent")
     """
-    def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
+    def __init__(
+        self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None  # no-cython-lint
+    ):
         """Create a Graph object.
 
         doc (Doc): The Doc object the graph will refer to.
@@ -443,8 +444,6 @@ cdef class Graph:
         be returned, and no new edge will be created. The weight of the edge
         will be updated if a weight is specified.
         """
-        label_hash = self.doc.vocab.strings.as_int(label)
-        weight_float = weight if weight is not None else 0.0
         edge_index = add_edge(
             &self.c,
             EdgeC(
@@ -89,4 +89,3 @@ cdef class MorphAnalysis:
 
     def __repr__(self):
         return self.to_json()
-
@@ -1,5 +1,4 @@
 cimport numpy as np
-from libc.math cimport sqrt
 
 import copy
 import warnings
@@ -10,11 +9,10 @@ from thinc.api import get_array_module
 from ..attrs cimport *
 from ..attrs cimport ORTH, attr_id_t
 from ..lexeme cimport Lexeme
-from ..parts_of_speech cimport univ_pos_t
-from ..structs cimport LexemeC, TokenC
+from ..structs cimport TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, flags_t, hash_t
-from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from ..typedefs cimport attr_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr
 from .token cimport Token
 
 from ..errors import Errors, Warnings
@@ -595,7 +593,6 @@ cdef class Span:
         """
         return "".join([t.text_with_ws for t in self])
 
-
     @property
     def noun_chunks(self):
         """Iterate over the base noun phrases in the span. Yields base
@@ -1,7 +1,7 @@
 import struct
 import weakref
 from copy import deepcopy
-from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
+from typing import Iterable, Optional, Union
 
 import srsly
 
@@ -34,7 +34,7 @@ cdef class SpanGroup:
 
     DOCS: https://spacy.io/api/spangroup
     """
-    def __init__(self, doc, *, name="", attrs={}, spans=[]):
+    def __init__(self, doc, *, name="", attrs={}, spans=[]):  # no-cython-lint
         """Create a SpanGroup.
 
         doc (Doc): The reference Doc object.
@@ -311,7 +311,7 @@ cdef class SpanGroup:
 
             other_attrs = deepcopy(other_group.attrs)
             span_group.attrs.update({
-                key: value for key, value in other_attrs.items() \
+                key: value for key, value in other_attrs.items()
                 if key not in span_group.attrs
             })
             if len(other_group):
@@ -98,12 +98,10 @@ cdef class Token:
         elif feat_name == SENT_START:
             token.sent_start = value
 
-
     @staticmethod
     cdef inline int missing_dep(const TokenC* token) nogil:
         return token.dep == MISSING_DEP
 
-
     @staticmethod
     cdef inline int missing_head(const TokenC* token) nogil:
         return Token.missing_dep(token)
@@ -1,13 +1,11 @@
 # cython: infer_types=True
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
-from cython.view cimport array as cvarray
 
 np.import_array()
 
 import warnings
 
-import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport (
@@ -545,9 +543,9 @@ cdef class Token:
         def __get__(self):
            if self.i + 1 == len(self.doc):
                 return True
-            elif self.doc[self.i+1].is_sent_start == None:
+            elif self.doc[self.i+1].is_sent_start is None:
                 return None
-            elif self.doc[self.i+1].is_sent_start == True:
+            elif self.doc[self.i+1].is_sent_start is True:
                 return True
             else:
                 return False
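The "is None" / "is True" fixes above matter because is_sent_start is tri-state (True, False, or None for "unset"), and identity tests keep the three cases distinct without relying on == coercion:

    for value in (True, False, None):
        if value is None:
            state = "unset"
        elif value is True:
            state = "sentence start"
        else:
            state = "not a sentence start"
        print(value, "->", state)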
@@ -37,10 +37,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
             b2a.append(set())
         # Process the alignment at the current position
         if A[token_idx_a] == B[token_idx_b] and \
-                (char_idx_a == 0 or \
-                    char_to_token_a[char_idx_a - 1] < token_idx_a) and \
-                (char_idx_b == 0 or \
-                    char_to_token_b[char_idx_b - 1] < token_idx_b):
+                (
+                    char_idx_a == 0 or
+                    char_to_token_a[char_idx_a - 1] < token_idx_a
+                ) and \
+                (
+                    char_idx_b == 0 or
+                    char_to_token_b[char_idx_b - 1] < token_idx_b
+                ):
             # Current tokens are identical and both character offsets are the
             # start of a token (either at the beginning of the document or the
             # previous character belongs to a different token)
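The reparenthesized condition in get_alignments tests that the current character in each text sits at the start of a token. Extracted as a tiny helper for illustration (a paraphrase, not the module's API):

    def at_token_start(char_idx, char_to_token, token_idx):
        # True at offset 0, or when the previous character belonged to an
        # earlier token -- mirroring each half of the reflowed condition
        return char_idx == 0 or char_to_token[char_idx - 1] < token_idx

    assert at_token_start(0, [], 0)
    assert at_token_start(3, [0, 0, 0], 1)      # chars 0-2 belong to token 0
    assert not at_token_start(2, [0, 0, 0], 0)  # still inside token 0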
@@ -1,4 +1,3 @@
-import warnings
 from collections.abc import Iterable as IterableInstance
 
 import numpy
@@ -161,7 +160,6 @@ cdef class Example:
                 self._y_sig = y_sig
                 return self._cached_alignment
 
-
     def _get_aligned_vectorized(self, align, gold_values):
         # Fast path for Doc attributes/fields that are predominantly a single value,
         # i.e., TAG, POS, MORPH.
@@ -204,7 +202,6 @@ cdef class Example:
 
         return output.tolist()
 
-
     def _get_aligned_non_vectorized(self, align, gold_values):
         # Slower path for fields that return multiple values (resulting
         # in ragged arrays that cannot be vectorized trivially).
@@ -221,7 +218,6 @@ cdef class Example:
 
         return output
 
-
     def get_aligned(self, field, as_string=False):
         """Return an aligned array for a token attribute."""
         align = self.alignment.x2y
@@ -330,7 +326,7 @@ cdef class Example:
             missing=None
         )
         # Now fill the tokens we can align to O.
-        O = 2 # I=1, O=2, B=3
+        O = 2 # I=1, O=2, B=3  # no-cython-lint: E741
         for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
             if x_tags[i] is None:
                 if ent_iob == O:
@@ -340,7 +336,7 @@ cdef class Example:
         return x_ents, x_tags
 
     def get_aligned_ner(self):
-        x_ents, x_tags = self.get_aligned_ents_and_ner()
+        _x_ents, x_tags = self.get_aligned_ents_and_ner()
         return x_tags
 
     def get_matching_ents(self, check_label=True):
@@ -398,7 +394,6 @@ cdef class Example:
 
         return span_dict
 
-
     def _links_to_dict(self):
         links = {}
         for ent in self.reference.ents:
@@ -589,6 +584,7 @@ def _fix_legacy_dict_data(example_dict):
         "doc_annotation": doc_dict
     }
 
+
 def _has_field(annot, field):
     if field not in annot:
         return False
@@ -625,6 +621,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
                 ent_types.append("")
     return ent_iobs, ent_types
 
+
 def _parse_links(vocab, words, spaces, links):
     reference = Doc(vocab, words=words, spaces=spaces)
     starts = {token.idx: token.i for token in reference}
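Two semantic touches ride along with the reflows above: the unused first return value of get_aligned_ents_and_ner() is renamed _x_ents to mark it as discarded, and the single-letter O keeps a targeted E741 suppression because it deliberately mirrors the BILUO tag value. A sketch of that coding (tag values taken from the "I=1, O=2, B=3" comment):

    O = 2  # the BILUO "outside" tag; E741 (ambiguous name) is silenced on purpose
    ent_iobs = [2, 3, 1, 2]
    outside = [i for i, tag in enumerate(ent_iobs) if tag == O]
    assert outside == [0, 3]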
| 
						 | 
@@ -1,4 +1,3 @@
-import json
 import warnings

 import srsly

@@ -6,7 +5,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import offsets_to_biluo_tags, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags


 def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):

@@ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
     json_doc = {"id": doc_id, "paragraphs": []}
     for i, doc in enumerate(docs):
         raw = None if doc.has_unknown_spaces else doc.text
-        json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
+        json_para = {
+            'raw': raw,
+            "sentences": [],
+            "cats": [],
+            "entities": [],
+            "links": []
+        }
         for cat, val in doc.cats.items():
             json_cat = {"label": cat, "value": val}
             json_para["cats"].append(json_cat)

@@ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
             if ent.kb_id_:
                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                 json_para["links"].append(link_dict)
-        biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
+        biluo_tags = offsets_to_biluo_tags(
+            doc, json_para["entities"], missing=ner_missing_tag
+        )
         attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
         include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
         for j, sent in enumerate(doc.sents):
             json_sent = {"tokens": [], "brackets": []}
             for token in sent:
-                json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
+                json_token = {
+                    "id": token.i, "orth": token.text, "space": token.whitespace_
+                }
                 if include_annotation["TAG"]:
                     json_token["tag"] = token.tag_
                 if include_annotation["POS"]:

@@ -125,9 +134,14 @@ def json_to_annotations(doc):
                 else:
                     sent_starts.append(-1)
             if "brackets" in sent:
-                brackets.extend((b["first"] + sent_start_i,
-                                 b["last"] + sent_start_i, b["label"])
-                                 for b in sent["brackets"])
+                brackets.extend(
+                    (
+                        b["first"] + sent_start_i,
+                        b["last"] + sent_start_i,
+                        b["label"]
+                    )
+                    for b in sent["brackets"]
+                )

         example["token_annotation"] = dict(
             ids=ids,

@@ -160,6 +174,7 @@ def json_to_annotations(doc):
         )
         yield example


 def json_iterate(bytes utf8_str):
     # We should've made these files jsonl...But since we didn't, parse out
     # the docs one-by-one to reduce memory usage.
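As context for the `docs_to_json` reformatting above, a minimal usage sketch of the function as its signature appears in this diff. This is an assumption-laden illustration, not part of the commit: it presumes a pipeline that sets sentence boundaries, since the function iterates over `doc.sents`.

```python
import spacy
from spacy.training import docs_to_json

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # docs_to_json iterates over doc.sents
doc = nlp("Apple is looking at buying a U.K. startup.")
# Serialize to the legacy v2 JSON training format.
json_doc = docs_to_json([doc], doc_id=0)
print(json_doc["paragraphs"][0]["sentences"][0]["tokens"][:3])
```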
@@ -1,10 +1,8 @@
-cimport numpy as np
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.set cimport set as cppset
 from murmurhash.mrmr cimport hash128_x64

-import functools
 import warnings
 from enum import Enum
 from typing import cast

@@ -262,8 +260,7 @@ cdef class Vectors:
         return (
             self.shape == other.shape
             and self.key2row == other.key2row
-                and self.to_bytes(exclude=["strings"])
-                  == other.to_bytes(exclude=["strings"])
+            and self.to_bytes(exclude=["strings"]) == other.to_bytes(exclude=["strings"])
         )

     def resize(self, shape, inplace=False):

@@ -524,7 +521,8 @@ cdef class Vectors:
             scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:, -n:]

             if sort and n >= 2:
-                sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
+                sorted_index = xp.arange(scores.shape[0])[:, None][i:i+batch_size], \
+                    xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1]
                 scores[i:i+batch_size] = scores[sorted_index]
                 best_rows[i:i+batch_size] = best_rows[sorted_index]

@@ -538,8 +536,12 @@ cdef class Vectors:

         numpy_rows = get_current_ops().to_numpy(best_rows)
         keys = xp.asarray(
-            [[row2key[row] for row in numpy_rows[i] if row in row2key]
-                    for i in range(len(queries)) ], dtype="uint64")
+            [
+                [row2key[row] for row in numpy_rows[i] if row in row2key]
+                for i in range(len(queries))
+            ],
+            dtype="uint64"
+        )
         return (keys, best_rows, scores)

     def to_ops(self, ops: Ops):

@@ -582,9 +584,9 @@ cdef class Vectors:
         """
         xp = get_array_module(self.data)
         if xp is numpy:
-            save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
+            save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)  # no-cython-lint
         else:
-            save_array = lambda arr, file_: xp.save(file_, arr)
+            save_array = lambda arr, file_: xp.save(file_, arr)  # no-cython-lint

         def save_vectors(path):
             # the source of numpy.save indicates that the file object is closed after use.
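For reference on the `most_similar` internals reformatted above, here is a small sketch of the public API with the same semantics. The data and keys are made up for illustration; this is not part of the commit.

```python
import numpy
from spacy.vectors import Vectors

data = numpy.random.uniform(-1, 1, (5, 8)).astype("float32")
keys = ["apple", "orange", "pear", "car", "bus"]
vectors = Vectors(data=data, keys=keys)

# Query with one vector; with sort=True (the default) and n >= 2, the
# results are ordered best-first via the argsort fancy indexing in the diff.
queries = data[:1]
best_keys, best_rows, scores = vectors.most_similar(queries, n=2)
print(best_keys, scores)
```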
@@ -1,6 +1,4 @@
 # cython: profile=True
-from libc.string cimport memcpy
-
 import functools

 import numpy

@@ -19,7 +17,6 @@ from .errors import Errors
 from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
 from .lang.norm_exceptions import BASE_NORMS
 from .lookups import Lookups
-from .util import registry
 from .vectors import Mode as VectorsMode
 from .vectors import Vectors

@@ -51,9 +48,17 @@ cdef class Vocab:

     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
-                 oov_prob=-20., vectors_name=None, writing_system={},
-                 get_noun_chunks=None, **deprecated_kwargs):
+    def __init__(
+        self,
+        lex_attr_getters=None,
+        strings=tuple(),
+        lookups=None,
+        oov_prob=-20.,
+        vectors_name=None,
+        writing_system={},  # no-cython-lint
+        get_noun_chunks=None,
+        **deprecated_kwargs
+    ):
         """Create the vocabulary.

         lex_attr_getters (dict): A dictionary mapping attribute IDs to

@@ -150,7 +155,6 @@ cdef class Vocab:
         cdef LexemeC* lex
         cdef hash_t key = self.strings[string]
         lex = <LexemeC*>self._by_orth.get(key)
-        cdef size_t addr
         if lex != NULL:
             assert lex.orth in self.strings
             if lex.orth != key:

@@ -463,7 +467,6 @@ cdef class Vocab:
                     self.lookups.get_table("lexeme_norm"),
                 )

-
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.

@@ -476,7 +479,6 @@ cdef class Vocab:
         path = util.ensure_path(path)
         if not path.exists():
             path.mkdir()
-        setters = ["strings", "vectors"]
         if "strings" not in exclude:
             self.strings.to_disk(path / "strings.json")
         if "vectors" not in exclude:

@@ -495,7 +497,6 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#to_disk
         """
         path = util.ensure_path(path)
-        getters = ["strings", "vectors"]
         if "strings" not in exclude:
             self.strings.from_disk(path / "strings.json")  # TODO: add exclude?
         if "vectors" not in exclude:
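As a usage note for the `Vocab.__init__` signature reformatted above, a minimal construction sketch; the example strings are illustrative only.

```python
from spacy.vocab import Vocab

# strings may be an iterable of strings to preload into the StringStore
vocab = Vocab(strings=["apple", "orange"])
lexeme = vocab["apple"]
print(lexeme.text, lexeme.orth)  # surface form and its hash
```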
@@ -856,7 +856,7 @@ token-to-vector embedding component like [`Tok2Vec`](/api/tok2vec) or
 training a pipeline with components sourced from an existing pipeline: if
 multiple components (e.g. tagger, parser, NER) listen to the same
 token-to-vector component, but some of them are frozen and not updated, their
-performance may degrade significally as the token-to-vector component is updated
+performance may degrade significantly as the token-to-vector component is updated
 with new data. To prevent this, listeners can be replaced with a standalone
 token-to-vector layer that is owned by the component and doesn't change if the
 component isn't updated.
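For context on the listener replacement described in this hunk, a sketch using `Language.replace_listeners`. The pipeline name and the assumption that `tagger` listens to the shared `tok2vec` are illustrative, not part of the commit.

```python
import spacy

# A pipeline whose tagger listens to a shared tok2vec component.
nlp = spacy.load("en_core_web_sm")
# Give the tagger its own copy of the tok2vec layer, so later updates to
# the shared component can no longer degrade its performance.
nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
```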
@@ -60,7 +60,7 @@ architectures and their arguments and hyperparameters.
 | `model`      | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
 | `spans_key`  | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
 | `threshold`  | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
-| `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ |
+| `max_length` | Maximum length of the produced spans, defaults to `25`. ~~Optional[int]~~ |
 | `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
 | `scorer`     | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
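A hedged sketch of passing these settings when adding the component; the factory name `span_finder` is an inference from the per-token probability model in the table above, and the values shown simply restate the documented defaults.

```python
import spacy

nlp = spacy.blank("en")
# Configure the component with the defaults listed in the table.
nlp.add_pipe(
    "span_finder",
    config={"spans_key": "sc", "threshold": 0.5, "max_length": 25},
)
```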
@@ -310,7 +310,7 @@ You can configure the build process with the following environment variables:
 | Variable       | Description |
 | -------------- | ----------- |
 | `SPACY_EXTRAS` | Additional Python packages to install alongside spaCy with optional version specifications. Should be a string that can be passed to `pip install`. See [`Makefile`](%%GITHUB_SPACY/Makefile) for defaults. |
-| `PYVER`        | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.6`. |
+| `PYVER`        | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.8`. |
 | `WHEELHOUSE`   | Directory to store the wheel files during compilation. Defaults to `./wheelhouse`. |

 ### Run tests {id="run-tests"}
@@ -113,7 +113,7 @@ print(doc[2].morph)  # 'Case=Nom|Person=2|PronType=Prs'
 print(doc[2].pos_)  # 'PRON'
 ```

-## Lemmatization {id="lemmatization",model="lemmatizer",version="3"}
+## Lemmatization {id="lemmatization",version="3"}

 spaCy provides two pipeline components for lemmatization:
@@ -170,7 +170,7 @@ nlp = spacy.blank("sv")
 nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
 ```

-### Rule-based lemmatizer {id="lemmatizer-rule"}
+### Rule-based lemmatizer {id="lemmatizer-rule",model="morphologizer"}

 When training pipelines that include a component that assigns part-of-speech
 tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a
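As a companion note to the `mode: "lookup"` example in the hunk above, the rule-based mode can be enabled the same way. This is a sketch under stated assumptions: the rule lemmatizer needs POS annotation from an earlier component at runtime, and for English it needs the lookups tables from `spacy-lookups-data` at initialization.

```python
import spacy

nlp = spacy.blank("en")
# Rule-based lemmatization: requires Token.pos to be set by an earlier
# component (e.g. a morphologizer or tagger) before lemmas are assigned.
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
```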
@@ -194,7 +194,7 @@ information, without consulting the context of the token. The rule-based
 lemmatizer also accepts list-based exception files. For English, these are
 acquired from [WordNet](https://wordnet.princeton.edu/).

-### Trainable lemmatizer
+### Trainable lemmatizer {id="lemmatizer-train",model="trainable_lemmatizer"}

 The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma
 transformations from a training corpus that includes lemma annotations. This
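A minimal sketch of adding the trainable lemmatizer referenced by the new heading; `trainable_lemmatizer` is the registered factory name of `EditTreeLemmatizer`, and the `name` override is just a convention for keeping the pipe name short.

```python
import spacy

nlp = spacy.blank("en")
# EditTreeLemmatizer learns form-to-lemma edit trees from lemma
# annotations during training; it requires annotated data to be useful.
nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
```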
@@ -27,7 +27,7 @@
         "indexName": "spacy"
     },
     "binderUrl": "explosion/spacy-io-binder",
-    "binderVersion": "3.5",
+    "binderVersion": "3.6",
     "sections": [
         { "id": "usage", "title": "Usage Documentation", "theme": "blue" },
         { "id": "models", "title": "Models Documentation", "theme": "blue" },