mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge branch 'upstream_master' into sync_develop
This commit is contained in:
commit
79ec68f01b
6
.github/workflows/tests.yml
vendored
6
.github/workflows/tests.yml
vendored
|
@ -45,6 +45,12 @@ jobs:
|
|||
run: |
|
||||
python -m pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
- name: cython-lint
|
||||
run: |
|
||||
python -m pip install cython-lint -c requirements.txt
|
||||
# E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
|
||||
cython-lint spacy --ignore E501,W291,E266
|
||||
|
||||
tests:
|
||||
name: Test
|
||||
needs: Validate
|
||||
|
|
4
Makefile
4
Makefile
|
@ -1,11 +1,11 @@
|
|||
SHELL := /bin/bash
|
||||
|
||||
ifndef SPACY_EXTRAS
|
||||
override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
|
||||
override SPACY_EXTRAS = spacy-lookups-data==1.0.3
|
||||
endif
|
||||
|
||||
ifndef PYVER
|
||||
override PYVER = 3.6
|
||||
override PYVER = 3.8
|
||||
endif
|
||||
|
||||
VENV := ./env$(PYVER)
|
||||
|
|
|
@ -39,4 +39,5 @@ types-setuptools>=57.0.0
|
|||
types-requests
|
||||
types-setuptools>=57.0.0
|
||||
black==22.3.0
|
||||
cython-lint>=0.15.0; python_version >= "3.7"
|
||||
isort>=5.0,<6.0
|
||||
|
|
|
@ -96,4 +96,4 @@ cdef enum attr_id_t:
|
|||
ENT_ID = symbols.ENT_ID
|
||||
|
||||
IDX
|
||||
SENT_END
|
||||
SENT_END
|
||||
|
|
|
@ -117,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
if "pos" in stringy_attrs:
|
||||
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
||||
if "morph" in stringy_attrs:
|
||||
morphs = stringy_attrs.pop("morph")
|
||||
morphs = stringy_attrs.pop("morph") # no-cython-lint
|
||||
if "number" in stringy_attrs:
|
||||
stringy_attrs.pop("number")
|
||||
if "tenspect" in stringy_attrs:
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import itertools
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
|
@ -218,7 +217,7 @@ class SpanRenderer:
|
|||
+ (self.offset_step * (len(entities) - 1))
|
||||
)
|
||||
markup += self.span_template.format(
|
||||
text=token["text"],
|
||||
text=escape_html(token["text"]),
|
||||
span_slices=slices,
|
||||
span_starts=starts,
|
||||
total_height=total_height,
|
||||
|
|
|
@ -4,7 +4,8 @@ from ..typedefs cimport hash_t
|
|||
from .kb cimport KnowledgeBase
|
||||
|
||||
|
||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
||||
# Object used by the Entity Linker that summarizes one entity-alias candidate
|
||||
# combination.
|
||||
cdef class Candidate:
|
||||
cdef readonly KnowledgeBase kb
|
||||
cdef hash_t entity_hash
|
||||
|
|
|
@ -8,15 +8,24 @@ from ..tokens import Span
|
|||
|
||||
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or
|
||||
may not be resolved to a specific `entity` from a Knowledge Base. This
|
||||
will be used as input for the entity linking algorithm which will
|
||||
disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||
"""
|
||||
|
||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||
def __init__(
|
||||
self,
|
||||
KnowledgeBase kb,
|
||||
entity_hash,
|
||||
entity_freq,
|
||||
entity_vector,
|
||||
alias_hash,
|
||||
prior_prob
|
||||
):
|
||||
self.kb = kb
|
||||
self.entity_hash = entity_hash
|
||||
self.entity_freq = entity_freq
|
||||
|
@ -59,7 +68,8 @@ cdef class Candidate:
|
|||
|
||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given mention and fetching appropriate entries from the index.
|
||||
Return candidate entities for a given mention, fetching appropriate
|
||||
entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention for which to identify candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
|
@ -67,9 +77,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
|||
return kb.get_candidates(mention)
|
||||
|
||||
|
||||
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
def get_candidates_batch(
|
||||
kb: KnowledgeBase, mentions: Iterable[Span]
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
||||
Return candidate entities for the given mentions, fetching appropriate entries
|
||||
from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
|
|
|
@ -12,8 +12,9 @@ from .candidate import Candidate
|
|||
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and
|
||||
their textual aliases, to support entity linking of named entities to
|
||||
real-world concepts.
|
||||
This is an abstract class and requires its operations to be implemented.
|
||||
|
||||
DOCS: https://spacy.io/api/kb
|
||||
|
@ -31,10 +32,13 @@ cdef class KnowledgeBase:
|
|||
self.entity_vector_length = entity_vector_length
|
||||
self.mem = Pool()
|
||||
|
||||
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
def get_candidates_batch(
|
||||
self, mentions: Iterable[Span]
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
Return candidate entities for specified texts. Each candidate defines
|
||||
the entity, the original alias, and the prior probability of that
|
||||
alias resolving to that entity.
|
||||
If no candidate is found for a given text, an empty list is returned.
|
||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
|
@ -43,14 +47,17 @@ cdef class KnowledgeBase:
|
|||
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
|
||||
Return candidate entities for specified text. Each candidate defines
|
||||
the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If no candidate is found for a given text, an empty list is returned.
|
||||
mention (Span): Mention for which to get candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="get_candidates", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
||||
|
@ -68,7 +75,9 @@ cdef class KnowledgeBase:
|
|||
RETURNS (Iterable[float]): Vector for specified entity.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="get_vector", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
|
@ -76,7 +85,9 @@ cdef class KnowledgeBase:
|
|||
RETURNS (bytes): Current state as binary string.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="to_bytes", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
||||
|
@ -85,25 +96,35 @@ cdef class KnowledgeBase:
|
|||
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="from_bytes", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""
|
||||
Write KnowledgeBase content to disk.
|
||||
path (Union[str, Path]): Target file path.
|
||||
exclude (Iterable[str]): List of components to exclude.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="to_disk", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""
|
||||
Load KnowledgeBase content from disk.
|
||||
path (Union[str, Path]): Target file path.
|
||||
exclude (Iterable[str]): List of components to exclude.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="from_disk", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
|
|
@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
# optional data, we can let users configure a DB as the backend for this.
|
||||
cdef object _features_table
|
||||
|
||||
|
||||
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
|
||||
"""Add an entity vector to the vectors table."""
|
||||
cdef int64_t new_index = self._vectors_table.size()
|
||||
self._vectors_table.push_back(entity_vector)
|
||||
return new_index
|
||||
|
||||
|
||||
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
|
||||
int32_t vector_index, int feats_row) nogil:
|
||||
cdef inline int64_t c_add_entity(
|
||||
self,
|
||||
hash_t entity_hash,
|
||||
float freq,
|
||||
int32_t vector_index,
|
||||
int feats_row
|
||||
) nogil:
|
||||
"""Add an entry to the vector of entries.
|
||||
After calling this method, make sure to update also the _entry_index using the return value"""
|
||||
After calling this method, make sure to update also the _entry_index
|
||||
using the return value"""
|
||||
# This is what we'll map the entity hash key to. It's where the entry will sit
|
||||
# in the vector of entries, so we can get it later.
|
||||
cdef int64_t new_index = self._entries.size()
|
||||
|
||||
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
|
||||
# Avoid struct initializer to enable nogil, cf.
|
||||
# https://github.com/cython/cython/issues/1642
|
||||
cdef KBEntryC entry
|
||||
entry.entity_hash = entity_hash
|
||||
entry.vector_index = vector_index
|
||||
|
@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
self._entries.push_back(entry)
|
||||
return new_index
|
||||
|
||||
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
|
||||
"""Connect a mention to a list of potential entities with their prior probabilities .
|
||||
After calling this method, make sure to update also the _alias_index using the return value"""
|
||||
# This is what we'll map the alias hash key to. It's where the alias will be defined
|
||||
# in the vector of aliases.
|
||||
cdef inline int64_t c_add_aliases(
|
||||
self,
|
||||
hash_t alias_hash,
|
||||
vector[int64_t] entry_indices,
|
||||
vector[float] probs
|
||||
) nogil:
|
||||
"""Connect a mention to a list of potential entities with their prior
|
||||
probabilities. After calling this method, make sure to update also the
|
||||
_alias_index using the return value"""
|
||||
# This is what we'll map the alias hash key to. It's where the alias will be
|
||||
# defined in the vector of aliases.
|
||||
cdef int64_t new_index = self._aliases_table.size()
|
||||
|
||||
# Avoid struct initializer to enable nogil
|
||||
|
@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
|
||||
"""
|
||||
Initializing the vectors and making sure the first element of each vector is a dummy,
|
||||
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
|
||||
Initializing the vectors and making sure the first element of each vector is a
|
||||
dummy, because the PreshMap maps pointing to indices in these vectors can not
|
||||
contain 0 as value.
|
||||
cf. https://github.com/explosion/preshed/issues/17
|
||||
"""
|
||||
cdef int32_t dummy_value = 0
|
||||
|
@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
cdef class Writer:
|
||||
cdef FILE* _fp
|
||||
|
||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
|
||||
cdef int write_header(
|
||||
self, int64_t nr_entries, int64_t entity_vector_length
|
||||
) except -1
|
||||
cdef int write_vector_element(self, float element) except -1
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
|
||||
cdef int write_entry(
|
||||
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||
) except -1
|
||||
|
||||
cdef int write_alias_length(self, int64_t alias_length) except -1
|
||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
|
||||
cdef int write_alias_header(
|
||||
self, hash_t alias_hash, int64_t candidate_length
|
||||
) except -1
|
||||
cdef int write_alias(self, int64_t entry_index, float prob) except -1
|
||||
|
||||
cdef int _write(self, void* value, size_t size) except -1
|
||||
|
@ -143,12 +161,18 @@ cdef class Writer:
|
|||
cdef class Reader:
|
||||
cdef FILE* _fp
|
||||
|
||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
|
||||
cdef int read_header(
|
||||
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||
) except -1
|
||||
cdef int read_vector_element(self, float* element) except -1
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
|
||||
cdef int read_entry(
|
||||
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||
) except -1
|
||||
|
||||
cdef int read_alias_length(self, int64_t* alias_length) except -1
|
||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
|
||||
cdef int read_alias_header(
|
||||
self, hash_t* alias_hash, int64_t* candidate_length
|
||||
) except -1
|
||||
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
||||
|
||||
cdef int _read(self, void* value, size_t size) except -1
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
from typing import Any, Callable, Dict, Iterable, Union
|
||||
from typing import Any, Callable, Dict, Iterable
|
||||
|
||||
import srsly
|
||||
|
||||
|
@ -27,8 +27,9 @@ from .candidate import Candidate as Candidate
|
|||
|
||||
|
||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities
|
||||
and their textual aliases, to support entity linking of named entities to
|
||||
real-world concepts.
|
||||
|
||||
DOCS: https://spacy.io/api/inmemorylookupkb
|
||||
"""
|
||||
|
@ -71,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
||||
"""
|
||||
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
||||
Add an entity to the KB, optionally specifying its log probability
|
||||
based on corpus frequency.
|
||||
Return the hash of the entity ID/name at the end.
|
||||
"""
|
||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||
|
@ -83,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
# Raise an error if the provided entity vector is not of the correct length
|
||||
if len(entity_vector) != self.entity_vector_length:
|
||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
||||
raise ValueError(
|
||||
Errors.E141.format(
|
||||
found=len(entity_vector), required=self.entity_vector_length
|
||||
)
|
||||
)
|
||||
|
||||
vector_index = self.c_add_vector(entity_vector=entity_vector)
|
||||
|
||||
new_index = self.c_add_entity(entity_hash=entity_hash,
|
||||
freq=freq,
|
||||
vector_index=vector_index,
|
||||
feats_row=-1) # Features table currently not implemented
|
||||
new_index = self.c_add_entity(
|
||||
entity_hash=entity_hash,
|
||||
freq=freq,
|
||||
vector_index=vector_index,
|
||||
feats_row=-1
|
||||
) # Features table currently not implemented
|
||||
self._entry_index[entity_hash] = new_index
|
||||
|
||||
return entity_hash
|
||||
|
@ -115,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
else:
|
||||
entity_vector = vector_list[i]
|
||||
if len(entity_vector) != self.entity_vector_length:
|
||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
||||
raise ValueError(
|
||||
Errors.E141.format(
|
||||
found=len(entity_vector),
|
||||
required=self.entity_vector_length
|
||||
)
|
||||
)
|
||||
|
||||
entry.entity_hash = entity_hash
|
||||
entry.freq = freq_list[i]
|
||||
|
@ -149,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
previous_alias_nr = self.get_size_aliases()
|
||||
# Throw an error if the length of entities and probabilities are not the same
|
||||
if not len(entities) == len(probabilities):
|
||||
raise ValueError(Errors.E132.format(alias=alias,
|
||||
entities_length=len(entities),
|
||||
probabilities_length=len(probabilities)))
|
||||
raise ValueError(
|
||||
Errors.E132.format(
|
||||
alias=alias,
|
||||
entities_length=len(entities),
|
||||
probabilities_length=len(probabilities))
|
||||
)
|
||||
|
||||
# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
|
||||
# Throw an error if the probabilities sum up to more than 1 (allow for
|
||||
# some rounding errors)
|
||||
prob_sum = sum(probabilities)
|
||||
if prob_sum > 1.00001:
|
||||
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
|
||||
|
@ -170,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
for entity, prob in zip(entities, probabilities):
|
||||
entity_hash = self.vocab.strings[entity]
|
||||
if not entity_hash in self._entry_index:
|
||||
if entity_hash not in self._entry_index:
|
||||
raise ValueError(Errors.E134.format(entity=entity))
|
||||
|
||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||
entry_indices.push_back(int(entry_index))
|
||||
probs.push_back(float(prob))
|
||||
|
||||
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
|
||||
new_index = self.c_add_aliases(
|
||||
alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
|
||||
)
|
||||
self._alias_index[alias_hash] = new_index
|
||||
|
||||
if previous_alias_nr + 1 != self.get_size_aliases():
|
||||
raise RuntimeError(Errors.E891.format(alias=alias))
|
||||
return alias_hash
|
||||
|
||||
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
|
||||
def append_alias(
|
||||
self, str alias, str entity, float prior_prob, ignore_warnings=False
|
||||
):
|
||||
"""
|
||||
For an alias already existing in the KB, extend its potential entities with one more.
|
||||
For an alias already existing in the KB, extend its potential entities
|
||||
with one more.
|
||||
Throw a warning if either the alias or the entity is unknown,
|
||||
or when the combination is already previously recorded.
|
||||
Throw an error if this entity+prior prob would exceed the sum of 1.
|
||||
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
|
||||
For efficiency, it's best to use the method `add_alias` as much as
|
||||
possible instead of this one.
|
||||
"""
|
||||
# Check if the alias exists in the KB
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
if not alias_hash in self._alias_index:
|
||||
if alias_hash not in self._alias_index:
|
||||
raise ValueError(Errors.E176.format(alias=alias))
|
||||
|
||||
# Check if the entity exists in the KB
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
if not entity_hash in self._entry_index:
|
||||
if entity_hash not in self._entry_index:
|
||||
raise ValueError(Errors.E134.format(entity=entity))
|
||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||
|
||||
# Throw an error if the prior probabilities (including the new one) sum up to more than 1
|
||||
# Throw an error if the prior probabilities (including the new one)
|
||||
# sum up to more than 1
|
||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
current_sum = sum([p for p in alias_entry.probs])
|
||||
|
@ -236,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
Return candidate entities for an alias. Each candidate defines the
|
||||
entity, the original alias, and the prior probability of that alias
|
||||
resolving to that entity.
|
||||
If the alias is not known in the KB, an empty list is returned.
|
||||
"""
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
if not alias_hash in self._alias_index:
|
||||
if alias_hash not in self._alias_index:
|
||||
return []
|
||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
|
@ -249,10 +274,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
return [Candidate(kb=self,
|
||||
entity_hash=self._entries[entry_index].entity_hash,
|
||||
entity_freq=self._entries[entry_index].freq,
|
||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||
entity_vector=self._vectors_table[
|
||||
self._entries[entry_index].vector_index
|
||||
],
|
||||
alias_hash=alias_hash,
|
||||
prior_prob=prior_prob)
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||
for (entry_index, prior_prob) in zip(
|
||||
alias_entry.entry_indices, alias_entry.probs
|
||||
)
|
||||
if entry_index != 0]
|
||||
|
||||
def get_vector(self, str entity):
|
||||
|
@ -266,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
return self._vectors_table[self._entries[entry_index].vector_index]
|
||||
|
||||
def get_prior_prob(self, str entity, str alias):
|
||||
""" Return the prior probability of a given alias being linked to a given entity,
|
||||
or return 0.0 when this combination is not known in the knowledge base"""
|
||||
""" Return the prior probability of a given alias being linked to a
|
||||
given entity, or return 0.0 when this combination is not known in the
|
||||
knowledge base."""
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
|
||||
|
@ -278,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
entry_index = self._entry_index[entity_hash]
|
||||
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
|
||||
for (entry_index, prior_prob) in zip(
|
||||
alias_entry.entry_indices, alias_entry.probs
|
||||
):
|
||||
if self._entries[entry_index].entity_hash == entity_hash:
|
||||
return prior_prob
|
||||
|
||||
|
@ -288,13 +320,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
"""Serialize the current state to a binary string.
|
||||
"""
|
||||
def serialize_header():
|
||||
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
|
||||
header = (
|
||||
self.get_size_entities(),
|
||||
self.get_size_aliases(),
|
||||
self.entity_vector_length
|
||||
)
|
||||
return srsly.json_dumps(header)
|
||||
|
||||
def serialize_entries():
|
||||
i = 1
|
||||
tuples = []
|
||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
||||
for entry_hash, entry_index in sorted(
|
||||
self._entry_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
entry = self._entries[entry_index]
|
||||
assert entry.entity_hash == entry_hash
|
||||
assert entry_index == i
|
||||
|
@ -307,7 +345,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
headers = []
|
||||
indices_lists = []
|
||||
probs_lists = []
|
||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
||||
for alias_hash, alias_index in sorted(
|
||||
self._alias_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
alias = self._aliases_table[alias_index]
|
||||
assert alias_index == i
|
||||
candidate_length = len(alias.entry_indices)
|
||||
|
@ -365,7 +405,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
indices = srsly.json_loads(all_data[1])
|
||||
probs = srsly.json_loads(all_data[2])
|
||||
for header, indices, probs in zip(headers, indices, probs):
|
||||
alias_hash, candidate_length = header
|
||||
alias_hash, _candidate_length = header
|
||||
alias.entry_indices = indices
|
||||
alias.probs = probs
|
||||
self._aliases_table[i] = alias
|
||||
|
@ -414,10 +454,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
writer.write_vector_element(element)
|
||||
i = i+1
|
||||
|
||||
# dumping the entry records in the order in which they are in the _entries vector.
|
||||
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
|
||||
# dumping the entry records in the order in which they are in the
|
||||
# _entries vector.
|
||||
# index 0 is a dummy object not stored in the _entry_index and can
|
||||
# be ignored.
|
||||
i = 1
|
||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
||||
for entry_hash, entry_index in sorted(
|
||||
self._entry_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
entry = self._entries[entry_index]
|
||||
assert entry.entity_hash == entry_hash
|
||||
assert entry_index == i
|
||||
|
@ -429,7 +473,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
# dumping the aliases in the order in which they are in the _alias_index vector.
|
||||
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
||||
i = 1
|
||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
||||
for alias_hash, alias_index in sorted(
|
||||
self._alias_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
alias = self._aliases_table[alias_index]
|
||||
assert alias_index == i
|
||||
|
||||
|
@ -535,7 +581,8 @@ cdef class Writer:
|
|||
def __init__(self, path):
|
||||
assert isinstance(path, Path)
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||
cdef bytes bytes_loc = content.encode('utf8') \
|
||||
if type(content) == str else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||
if not self._fp:
|
||||
raise IOError(Errors.E146.format(path=path))
|
||||
|
@ -545,14 +592,18 @@ cdef class Writer:
|
|||
cdef size_t status = fclose(self._fp)
|
||||
assert status == 0
|
||||
|
||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
|
||||
cdef int write_header(
|
||||
self, int64_t nr_entries, int64_t entity_vector_length
|
||||
) except -1:
|
||||
self._write(&nr_entries, sizeof(nr_entries))
|
||||
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
||||
|
||||
cdef int write_vector_element(self, float element) except -1:
|
||||
self._write(&element, sizeof(element))
|
||||
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
|
||||
cdef int write_entry(
|
||||
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||
) except -1:
|
||||
self._write(&entry_hash, sizeof(entry_hash))
|
||||
self._write(&entry_freq, sizeof(entry_freq))
|
||||
self._write(&vector_index, sizeof(vector_index))
|
||||
|
@ -561,7 +612,9 @@ cdef class Writer:
|
|||
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
||||
self._write(&alias_length, sizeof(alias_length))
|
||||
|
||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
|
||||
cdef int write_alias_header(
|
||||
self, hash_t alias_hash, int64_t candidate_length
|
||||
) except -1:
|
||||
self._write(&alias_hash, sizeof(alias_hash))
|
||||
self._write(&candidate_length, sizeof(candidate_length))
|
||||
|
||||
|
@ -577,16 +630,19 @@ cdef class Writer:
|
|||
cdef class Reader:
|
||||
def __init__(self, path):
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||
cdef bytes bytes_loc = content.encode('utf8') \
|
||||
if type(content) == str else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||
if not self._fp:
|
||||
PyErr_SetFromErrno(IOError)
|
||||
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||
fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||
|
||||
def __dealloc__(self):
|
||||
fclose(self._fp)
|
||||
|
||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
|
||||
cdef int read_header(
|
||||
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||
) except -1:
|
||||
status = self._read(nr_entries, sizeof(int64_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
|
@ -606,7 +662,9 @@ cdef class Reader:
|
|||
return 0 # end of file
|
||||
raise IOError(Errors.E145.format(param="vector element"))
|
||||
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
|
||||
cdef int read_entry(
|
||||
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||
) except -1:
|
||||
status = self._read(entity_hash, sizeof(hash_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
|
@ -637,7 +695,9 @@ cdef class Reader:
|
|||
return 0 # end of file
|
||||
raise IOError(Errors.E145.format(param="alias length"))
|
||||
|
||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
|
||||
cdef int read_alias_header(
|
||||
self, hash_t* alias_hash, int64_t* candidate_length
|
||||
) except -1:
|
||||
status = self._read(alias_hash, sizeof(hash_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
|
|
|
@ -1826,7 +1826,6 @@ class Language:
|
|||
# Later we replace the component config with the raw config again.
|
||||
interpolated = filled.interpolate() if not filled.is_interpolated else filled
|
||||
pipeline = interpolated.get("components", {})
|
||||
sourced = util.get_sourced_components(interpolated)
|
||||
# If components are loaded from a source (existing models), we cache
|
||||
# them here so they're only loaded once
|
||||
source_nlps = {}
|
||||
|
@ -1959,7 +1958,7 @@ class Language:
|
|||
useful when training a pipeline with components sourced from an existing
|
||||
pipeline: if multiple components (e.g. tagger, parser, NER) listen to
|
||||
the same tok2vec component, but some of them are frozen and not updated,
|
||||
their performance may degrade significally as the tok2vec component is
|
||||
their performance may degrade significantly as the tok2vec component is
|
||||
updated with new data. To prevent this, listeners can be replaced with
|
||||
a standalone tok2vec layer that is owned by the component and doesn't
|
||||
change if the component isn't updated.
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# cython: embedsignature=True
|
||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||
cimport numpy as np
|
||||
from cython.view cimport array as cvarray
|
||||
from libc.string cimport memset
|
||||
|
||||
np.import_array()
|
||||
|
@ -35,7 +34,7 @@ from .typedefs cimport attr_t, flags_t
|
|||
from .attrs import intify_attrs
|
||||
from .errors import Errors, Warnings
|
||||
|
||||
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
|
||||
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
EMPTY_LEXEME.id = OOV_RANK
|
||||
|
||||
|
@ -105,7 +104,7 @@ cdef class Lexeme:
|
|||
if isinstance(value, float):
|
||||
continue
|
||||
elif isinstance(value, (int, long)):
|
||||
Lexeme.set_struct_attr(self.c, attr, value)
|
||||
Lexeme.set_struct_attr(self.c, attr, value)
|
||||
else:
|
||||
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
|
||||
|
||||
|
@ -137,10 +136,12 @@ cdef class Lexeme:
|
|||
if hasattr(other, "orth"):
|
||||
if self.c.orth == other.orth:
|
||||
return 1.0
|
||||
elif hasattr(other, "__len__") and len(other) == 1 \
|
||||
and hasattr(other[0], "orth"):
|
||||
if self.c.orth == other[0].orth:
|
||||
return 1.0
|
||||
elif (
|
||||
hasattr(other, "__len__") and len(other) == 1
|
||||
and hasattr(other[0], "orth")
|
||||
and self.c.orth == other[0].orth
|
||||
):
|
||||
return 1.0
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
warnings.warn(Warnings.W008.format(obj="Lexeme"))
|
||||
return 0.0
|
||||
|
@ -149,7 +150,7 @@ cdef class Lexeme:
|
|||
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
|
||||
return result.item()
|
||||
|
||||
|
||||
@property
|
||||
def has_vector(self):
|
||||
"""RETURNS (bool): Whether a word vector is associated with the object.
|
||||
|
|
|
@ -108,7 +108,7 @@ cdef class DependencyMatcher:
|
|||
key (str): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
"""
|
||||
return self.has_key(key)
|
||||
return self.has_key(key) # no-cython-lint: W601
|
||||
|
||||
def _validate_input(self, pattern, key):
|
||||
idx = 0
|
||||
|
@ -264,7 +264,7 @@ cdef class DependencyMatcher:
|
|||
|
||||
def remove(self, key):
|
||||
key = self._normalize_key(key)
|
||||
if not key in self._patterns:
|
||||
if key not in self._patterns:
|
||||
raise ValueError(Errors.E175.format(key=key))
|
||||
self._patterns.pop(key)
|
||||
self._raw_patterns.pop(key)
|
||||
|
@ -382,7 +382,7 @@ cdef class DependencyMatcher:
|
|||
return []
|
||||
return [doc[node].head]
|
||||
|
||||
def _gov(self,doc,node):
|
||||
def _gov(self, doc, node):
|
||||
return list(doc[node].children)
|
||||
|
||||
def _dep_chain(self, doc, node):
|
||||
|
@ -443,7 +443,7 @@ cdef class DependencyMatcher:
|
|||
|
||||
def _right_child(self, doc, node):
|
||||
return [child for child in doc[node].rights]
|
||||
|
||||
|
||||
def _left_child(self, doc, node):
|
||||
return [child for child in doc[node].lefts]
|
||||
|
||||
|
@ -461,7 +461,7 @@ cdef class DependencyMatcher:
|
|||
if doc[node].head.i > node:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
|
||||
def _left_parent(self, doc, node):
|
||||
if doc[node].head.i < node:
|
||||
return [doc[node].head]
|
||||
|
|
|
@ -12,31 +12,18 @@ import warnings
|
|||
|
||||
import srsly
|
||||
|
||||
from ..attrs cimport (
|
||||
DEP,
|
||||
ENT_IOB,
|
||||
ID,
|
||||
LEMMA,
|
||||
MORPH,
|
||||
NULL_ATTR,
|
||||
ORTH,
|
||||
POS,
|
||||
TAG,
|
||||
attr_id_t,
|
||||
)
|
||||
from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
|
||||
from ..tokens.morphanalysis cimport MorphAnalysis
|
||||
from ..tokens.span cimport Span
|
||||
from ..tokens.token cimport Token
|
||||
from ..typedefs cimport attr_t
|
||||
from ..vocab cimport Vocab
|
||||
|
||||
from ..attrs import IDS
|
||||
from ..errors import Errors, MatchPatternError, Warnings
|
||||
from ..schemas import validate_token_pattern
|
||||
from ..strings import get_string_id
|
||||
from ..util import registry
|
||||
from .levenshtein import levenshtein_compare
|
||||
|
||||
DEF PADDING = 5
|
||||
|
@ -87,9 +74,9 @@ cdef class Matcher:
|
|||
key (str): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
"""
|
||||
return self.has_key(key)
|
||||
return self.has_key(key) # no-cython-lint: W601
|
||||
|
||||
def add(self, key, patterns, *, on_match=None, greedy: str=None):
|
||||
def add(self, key, patterns, *, on_match=None, greedy: str = None):
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||
key, an on_match callback, and one or more patterns.
|
||||
|
||||
|
@ -143,8 +130,13 @@ cdef class Matcher:
|
|||
key = self._normalize_key(key)
|
||||
for pattern in patterns:
|
||||
try:
|
||||
specs = _preprocess_pattern(pattern, self.vocab,
|
||||
self._extensions, self._extra_predicates, self._fuzzy_compare)
|
||||
specs = _preprocess_pattern(
|
||||
pattern,
|
||||
self.vocab,
|
||||
self._extensions,
|
||||
self._extra_predicates,
|
||||
self._fuzzy_compare
|
||||
)
|
||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||
for spec in specs:
|
||||
for attr, _ in spec[1]:
|
||||
|
@ -168,7 +160,7 @@ cdef class Matcher:
|
|||
key (str): The ID of the match rule.
|
||||
"""
|
||||
norm_key = self._normalize_key(key)
|
||||
if not norm_key in self._patterns:
|
||||
if norm_key not in self._patterns:
|
||||
raise ValueError(Errors.E175.format(key=key))
|
||||
self._patterns.pop(norm_key)
|
||||
self._callbacks.pop(norm_key)
|
||||
|
@ -268,8 +260,15 @@ cdef class Matcher:
|
|||
if self.patterns.empty():
|
||||
matches = []
|
||||
else:
|
||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
||||
extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
|
||||
matches = find_matches(
|
||||
&self.patterns[0],
|
||||
self.patterns.size(),
|
||||
doclike,
|
||||
length,
|
||||
extensions=self._extensions,
|
||||
predicates=self._extra_predicates,
|
||||
with_alignments=with_alignments
|
||||
)
|
||||
final_matches = []
|
||||
pairs_by_id = {}
|
||||
# For each key, either add all matches, or only the filtered,
|
||||
|
@ -289,9 +288,9 @@ cdef class Matcher:
|
|||
memset(matched, 0, length * sizeof(matched[0]))
|
||||
span_filter = self._filter.get(key)
|
||||
if span_filter == "FIRST":
|
||||
sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
|
||||
sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
|
||||
elif span_filter == "LONGEST":
|
||||
sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
|
||||
sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
|
||||
else:
|
||||
raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
|
||||
for match in sorted_pairs:
|
||||
|
@ -366,7 +365,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
cdef vector[MatchC] matches
|
||||
cdef vector[vector[MatchAlignmentC]] align_states
|
||||
cdef vector[vector[MatchAlignmentC]] align_matches
|
||||
cdef PatternStateC state
|
||||
cdef int i, j, nr_extra_attr
|
||||
cdef Pool mem = Pool()
|
||||
output = []
|
||||
|
@ -388,14 +386,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
value = token.vocab.strings[value]
|
||||
extra_attr_values[i * nr_extra_attr + index] = value
|
||||
# Main loop
|
||||
cdef int nr_predicate = len(predicates)
|
||||
for i in range(length):
|
||||
for j in range(n):
|
||||
states.push_back(PatternStateC(patterns[j], i, 0))
|
||||
if with_alignments != 0:
|
||||
align_states.resize(states.size())
|
||||
transition_states(states, matches, align_states, align_matches, predicate_cache,
|
||||
doclike[i], extra_attr_values, predicates, with_alignments)
|
||||
transition_states(
|
||||
states,
|
||||
matches,
|
||||
align_states,
|
||||
align_matches,
|
||||
predicate_cache,
|
||||
doclike[i],
|
||||
extra_attr_values,
|
||||
predicates,
|
||||
with_alignments
|
||||
)
|
||||
extra_attr_values += nr_extra_attr
|
||||
predicate_cache += len(predicates)
|
||||
# Handle matches that end in 0-width patterns
|
||||
|
@ -421,18 +427,28 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
return output
|
||||
|
||||
|
||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
||||
vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
|
||||
int8_t* cached_py_predicates,
|
||||
Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
|
||||
cdef void transition_states(
|
||||
vector[PatternStateC]& states,
|
||||
vector[MatchC]& matches,
|
||||
vector[vector[MatchAlignmentC]]& align_states,
|
||||
vector[vector[MatchAlignmentC]]& align_matches,
|
||||
int8_t* cached_py_predicates,
|
||||
Token token,
|
||||
const attr_t* extra_attrs,
|
||||
py_predicates,
|
||||
bint with_alignments
|
||||
) except *:
|
||||
cdef int q = 0
|
||||
cdef vector[PatternStateC] new_states
|
||||
cdef vector[vector[MatchAlignmentC]] align_new_states
|
||||
cdef int nr_predicate = len(py_predicates)
|
||||
for i in range(states.size()):
|
||||
if states[i].pattern.nr_py >= 1:
|
||||
update_predicate_cache(cached_py_predicates,
|
||||
states[i].pattern, token, py_predicates)
|
||||
update_predicate_cache(
|
||||
cached_py_predicates,
|
||||
states[i].pattern,
|
||||
token,
|
||||
py_predicates
|
||||
)
|
||||
action = get_action(states[i], token.c, extra_attrs,
|
||||
cached_py_predicates)
|
||||
if action == REJECT:
|
||||
|
@ -468,8 +484,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
|||
align_new_states.push_back(align_states[q])
|
||||
states[q].pattern += 1
|
||||
if states[q].pattern.nr_py != 0:
|
||||
update_predicate_cache(cached_py_predicates,
|
||||
states[q].pattern, token, py_predicates)
|
||||
update_predicate_cache(
|
||||
cached_py_predicates,
|
||||
states[q].pattern,
|
||||
token,
|
||||
py_predicates
|
||||
)
|
||||
action = get_action(states[q], token.c, extra_attrs,
|
||||
cached_py_predicates)
|
||||
# Update alignment before the transition of current state
|
||||
|
@ -485,8 +505,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
|||
ent_id = get_ent_id(state.pattern)
|
||||
if action == MATCH:
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length+1))
|
||||
MatchC(
|
||||
pattern_id=ent_id,
|
||||
start=state.start,
|
||||
length=state.length+1
|
||||
)
|
||||
)
|
||||
# `align_matches` always corresponds to `matches` 1:1
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
|
@ -494,23 +518,35 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
|||
# push match without last token if length > 0
|
||||
if state.length > 0:
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length))
|
||||
MatchC(
|
||||
pattern_id=ent_id,
|
||||
start=state.start,
|
||||
length=state.length
|
||||
)
|
||||
)
|
||||
# MATCH_DOUBLE emits matches twice,
|
||||
# add one more to align_matches in order to keep 1:1 relationship
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
# push match with last token
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length+1))
|
||||
MatchC(
|
||||
pattern_id=ent_id,
|
||||
start=state.start,
|
||||
length=state.length + 1
|
||||
)
|
||||
)
|
||||
# `align_matches` always corresponds to `matches` 1:1
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
elif action == MATCH_REJECT:
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length))
|
||||
MatchC(
|
||||
pattern_id=ent_id,
|
||||
start=state.start,
|
||||
length=state.length
|
||||
)
|
||||
)
|
||||
# `align_matches` always corresponds to `matches` 1:1
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
|
@ -533,8 +569,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
|||
align_states.push_back(align_new_states[i])
|
||||
|
||||
|
||||
cdef int update_predicate_cache(int8_t* cache,
|
||||
const TokenPatternC* pattern, Token token, predicates) except -1:
|
||||
cdef int update_predicate_cache(
|
||||
int8_t* cache,
|
||||
const TokenPatternC* pattern,
|
||||
Token token,
|
||||
predicates
|
||||
) except -1:
|
||||
# If the state references any extra predicates, check whether they match.
|
||||
# These are cached, so that we don't call these potentially expensive
|
||||
# Python functions more than we need to.
|
||||
|
@ -580,10 +620,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
|
|||
else:
|
||||
state.pattern += 1
|
||||
|
||||
|
||||
cdef action_t get_action(PatternStateC state,
|
||||
const TokenC* token, const attr_t* extra_attrs,
|
||||
const int8_t* predicate_matches) nogil:
|
||||
cdef action_t get_action(
|
||||
PatternStateC state,
|
||||
const TokenC * token,
|
||||
const attr_t * extra_attrs,
|
||||
const int8_t * predicate_matches
|
||||
) nogil:
|
||||
"""We need to consider:
|
||||
a) Does the token match the specification? [Yes, No]
|
||||
b) What's the quantifier? [1, 0+, ?]
|
||||
|
@ -649,53 +691,56 @@ cdef action_t get_action(PatternStateC state,
|
|||
is_match = not is_match
|
||||
quantifier = ONE
|
||||
if quantifier == ONE:
|
||||
if is_match and is_final:
|
||||
# Yes, final: 1000
|
||||
return MATCH
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0100
|
||||
return ADVANCE
|
||||
elif not is_match and is_final:
|
||||
# No, final: 0000
|
||||
return REJECT
|
||||
else:
|
||||
return REJECT
|
||||
if is_match and is_final:
|
||||
# Yes, final: 1000
|
||||
return MATCH
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0100
|
||||
return ADVANCE
|
||||
elif not is_match and is_final:
|
||||
# No, final: 0000
|
||||
return REJECT
|
||||
else:
|
||||
return REJECT
|
||||
elif quantifier == ZERO_PLUS:
|
||||
if is_match and is_final:
|
||||
# Yes, final: 1001
|
||||
return MATCH_EXTEND
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0011
|
||||
return RETRY_EXTEND
|
||||
elif not is_match and is_final:
|
||||
# No, final 2000 (note: Don't include last token!)
|
||||
return MATCH_REJECT
|
||||
else:
|
||||
# No, non-final 0010
|
||||
return RETRY
|
||||
if is_match and is_final:
|
||||
# Yes, final: 1001
|
||||
return MATCH_EXTEND
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0011
|
||||
return RETRY_EXTEND
|
||||
elif not is_match and is_final:
|
||||
# No, final 2000 (note: Don't include last token!)
|
||||
return MATCH_REJECT
|
||||
else:
|
||||
# No, non-final 0010
|
||||
return RETRY
|
||||
elif quantifier == ZERO_ONE:
|
||||
if is_match and is_final:
|
||||
# Yes, final: 3000
|
||||
# To cater for a pattern ending in "?", we need to add
|
||||
# a match both with and without the last token
|
||||
return MATCH_DOUBLE
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0110
|
||||
# We need both branches here, consider a pair like:
|
||||
# pattern: .?b string: b
|
||||
# If we 'ADVANCE' on the .?, we miss the match.
|
||||
return RETRY_ADVANCE
|
||||
elif not is_match and is_final:
|
||||
# No, final 2000 (note: Don't include last token!)
|
||||
return MATCH_REJECT
|
||||
else:
|
||||
# No, non-final 0010
|
||||
return RETRY
|
||||
if is_match and is_final:
|
||||
# Yes, final: 3000
|
||||
# To cater for a pattern ending in "?", we need to add
|
||||
# a match both with and without the last token
|
||||
return MATCH_DOUBLE
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0110
|
||||
# We need both branches here, consider a pair like:
|
||||
# pattern: .?b string: b
|
||||
# If we 'ADVANCE' on the .?, we miss the match.
|
||||
return RETRY_ADVANCE
|
||||
elif not is_match and is_final:
|
||||
# No, final 2000 (note: Don't include last token!)
|
||||
return MATCH_REJECT
|
||||
else:
|
||||
# No, non-final 0010
|
||||
return RETRY
|
||||
|
||||
|
||||
cdef int8_t get_is_match(PatternStateC state,
|
||||
const TokenC* token, const attr_t* extra_attrs,
|
||||
const int8_t* predicate_matches) nogil:
|
||||
cdef int8_t get_is_match(
|
||||
PatternStateC state,
|
||||
const TokenC* token,
|
||||
const attr_t* extra_attrs,
|
||||
const int8_t* predicate_matches
|
||||
) nogil:
|
||||
for i in range(state.pattern.nr_py):
|
||||
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
||||
return 0
|
||||
|
@ -860,7 +905,7 @@ class _FuzzyPredicate:
|
|||
self.is_extension = is_extension
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||
self.fuzzy = int(fuzz) if fuzz else -1
|
||||
self.fuzzy_compare = fuzzy_compare
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
|
||||
|
@ -1082,7 +1127,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
|
|||
elif cls == _FuzzyPredicate:
|
||||
if isinstance(value, dict):
|
||||
# add predicates inside fuzzy operator
|
||||
fuzz = type_[len("FUZZY"):] # number after prefix
|
||||
fuzz = type_[len("FUZZY"):] # number after prefix
|
||||
fuzzy_val = int(fuzz) if fuzz else -1
|
||||
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||
extra_predicates, seen_predicates,
|
||||
|
@ -1101,8 +1146,9 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
|
|||
return output
|
||||
|
||||
|
||||
def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
||||
seen_predicates):
|
||||
def _get_extension_extra_predicates(
|
||||
spec, extra_predicates, predicate_types, seen_predicates
|
||||
):
|
||||
output = []
|
||||
for attr, value in spec.items():
|
||||
if isinstance(value, dict):
|
||||
|
@ -1131,7 +1177,7 @@ def _get_operators(spec):
|
|||
return (ONE,)
|
||||
elif spec["OP"] in lookup:
|
||||
return lookup[spec["OP"]]
|
||||
#Min_max {n,m}
|
||||
# Min_max {n,m}
|
||||
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
|
||||
# {n} --> {n,n} exactly n ONE,(n)
|
||||
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
|
||||
|
@ -1142,8 +1188,8 @@ def _get_operators(spec):
|
|||
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
|
||||
n, m = min_max.split(",")
|
||||
|
||||
#1. Either n or m is a blank string and the other is numeric -->isdigit
|
||||
#2. Both are numeric and n <= m
|
||||
# 1. Either n or m is a blank string and the other is numeric -->isdigit
|
||||
# 2. Both are numeric and n <= m
|
||||
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
|
||||
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
||||
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
from libc.stdint cimport uintptr_t
|
||||
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
|
||||
|
||||
import warnings
|
||||
|
||||
from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
|
||||
from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG
|
||||
|
||||
from ..attrs import IDS
|
||||
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.span cimport Span
|
||||
from ..tokens.token cimport Token
|
||||
from ..typedefs cimport attr_t
|
||||
|
|
|
@ -40,11 +40,16 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
|
|||
|
||||
cdef void free_activations(const ActivationsC* A) nogil
|
||||
|
||||
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil
|
||||
|
||||
cdef void predict_states(
|
||||
CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
|
||||
) nogil
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
|
||||
|
||||
cdef void cpu_log_loss(float* d_scores,
|
||||
const float* costs, const int* is_valid, const float* scores, int O) nogil
|
||||
|
||||
cdef void cpu_log_loss(
|
||||
float* d_scores,
|
||||
const float* costs,
|
||||
const int* is_valid,
|
||||
const float* scores,
|
||||
int O
|
||||
) nogil
|
||||
|
|
|
@ -8,13 +8,13 @@ from thinc.backends.linalg cimport Vec, VecVec
|
|||
|
||||
import numpy
|
||||
import numpy.random
|
||||
from thinc.api import CupyOps, Model, NumpyOps, get_ops
|
||||
from thinc.api import CupyOps, Model, NumpyOps
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
|
||||
from ..pipeline._parser_internals.stateclass cimport StateClass
|
||||
from ..typedefs cimport class_t, hash_t, weight_t
|
||||
from ..typedefs cimport weight_t
|
||||
|
||||
|
||||
cdef WeightsC get_c_weights(model) except *:
|
||||
|
@ -78,33 +78,48 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
|||
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
else:
|
||||
A.token_ids = <int*>realloc(A.token_ids,
|
||||
n.states * n.feats * sizeof(A.token_ids[0]))
|
||||
A.scores = <float*>realloc(A.scores,
|
||||
n.states * n.classes * sizeof(A.scores[0]))
|
||||
A.unmaxed = <float*>realloc(A.unmaxed,
|
||||
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>realloc(A.hiddens,
|
||||
n.states * n.hiddens * sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>realloc(A.is_valid,
|
||||
n.states * n.classes * sizeof(A.is_valid[0]))
|
||||
A.token_ids = <int*>realloc(
|
||||
A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
|
||||
)
|
||||
A.scores = <float*>realloc(
|
||||
A.scores, n.states * n.classes * sizeof(A.scores[0])
|
||||
)
|
||||
A.unmaxed = <float*>realloc(
|
||||
A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
|
||||
)
|
||||
A.hiddens = <float*>realloc(
|
||||
A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
|
||||
)
|
||||
A.is_valid = <int*>realloc(
|
||||
A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
|
||||
)
|
||||
A._max_size = n.states
|
||||
A._curr_size = n.states
|
||||
|
||||
|
||||
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil:
|
||||
cdef double one = 1.0
|
||||
cdef void predict_states(
|
||||
CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
|
||||
) nogil:
|
||||
resize_activations(A, n)
|
||||
for i in range(n.states):
|
||||
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
|
||||
sum_state_features(cblas, A.unmaxed,
|
||||
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
|
||||
sum_state_features(
|
||||
cblas,
|
||||
A.unmaxed,
|
||||
W.feat_weights,
|
||||
A.token_ids,
|
||||
n.states,
|
||||
n.feats,
|
||||
n.hiddens * n.pieces
|
||||
)
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
|
||||
W.feat_bias, 1., n.hiddens * n.pieces)
|
||||
VecVec.add_i(
|
||||
&A.unmaxed[i*n.hiddens*n.pieces],
|
||||
W.feat_bias, 1.,
|
||||
n.hiddens * n.pieces
|
||||
)
|
||||
for j in range(n.hiddens):
|
||||
index = i * n.hiddens * n.pieces + j * n.pieces
|
||||
which = Vec.arg_max(&A.unmaxed[index], n.pieces)
|
||||
|
@ -114,14 +129,15 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
|||
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
|
||||
else:
|
||||
# Compute hidden-to-output
|
||||
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
|
||||
sgemm(cblas)(
|
||||
False, True, n.states, n.classes, n.hiddens,
|
||||
1.0, <const float *>A.hiddens, n.hiddens,
|
||||
<const float *>W.hidden_weights, n.hiddens,
|
||||
0.0, A.scores, n.classes)
|
||||
0.0, A.scores, n.classes
|
||||
)
|
||||
# Add bias
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.scores[i*n.classes],
|
||||
W.hidden_bias, 1., n.classes)
|
||||
VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
|
||||
# Set unseen classes to minimum value
|
||||
i = 0
|
||||
min_ = A.scores[0]
|
||||
|
@ -134,9 +150,16 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
|||
A.scores[i*n.classes+j] = min_
|
||||
|
||||
|
||||
cdef void sum_state_features(CBlas cblas, float* output,
|
||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||
cdef int idx, b, f, i
|
||||
cdef void sum_state_features(
|
||||
CBlas cblas,
|
||||
float* output,
|
||||
const float* cached,
|
||||
const int* token_ids,
|
||||
int B,
|
||||
int F,
|
||||
int O
|
||||
) nogil:
|
||||
cdef int idx, b, f
|
||||
cdef const float* feature
|
||||
padding = cached
|
||||
cached += F * O
|
||||
|
@ -153,9 +176,13 @@ cdef void sum_state_features(CBlas cblas, float* output,
|
|||
token_ids += F
|
||||
|
||||
|
||||
cdef void cpu_log_loss(float* d_scores,
|
||||
const float* costs, const int* is_valid, const float* scores,
|
||||
int O) nogil:
|
||||
cdef void cpu_log_loss(
|
||||
float* d_scores,
|
||||
const float* costs,
|
||||
const int* is_valid,
|
||||
const float* scores,
|
||||
int O
|
||||
) nogil:
|
||||
"""Do multi-label log loss"""
|
||||
cdef double max_, gmax, Z, gZ
|
||||
best = arg_max_if_gold(scores, costs, is_valid, O)
|
||||
|
@ -179,8 +206,9 @@ cdef void cpu_log_loss(float* d_scores,
|
|||
d_scores[i] = exp(scores[i]-max_) / Z
|
||||
|
||||
|
||||
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
|
||||
const int* is_valid, int n) nogil:
|
||||
cdef int arg_max_if_gold(
|
||||
const weight_t* scores, const weight_t* costs, const int* is_valid, int n
|
||||
) nogil:
|
||||
# Find minimum cost
|
||||
cdef float cost = 1
|
||||
for i in range(n):
|
||||
|
@ -204,10 +232,17 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
|||
return best
|
||||
|
||||
|
||||
|
||||
class ParserStepModel(Model):
|
||||
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
|
||||
dropout=0.1):
|
||||
def __init__(
|
||||
self,
|
||||
docs,
|
||||
layers,
|
||||
*,
|
||||
has_upper,
|
||||
unseen_classes=None,
|
||||
train=True,
|
||||
dropout=0.1
|
||||
):
|
||||
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
||||
self.attrs["has_upper"] = has_upper
|
||||
self.attrs["dropout_rate"] = dropout
|
||||
|
@ -268,8 +303,10 @@ class ParserStepModel(Model):
|
|||
return ids
|
||||
|
||||
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
|
||||
if isinstance(self.state2vec.ops, CupyOps) \
|
||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
||||
if (
|
||||
isinstance(self.state2vec.ops, CupyOps)
|
||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
|
||||
):
|
||||
# Move token_ids and d_vector to GPU, asynchronously
|
||||
self.backprops.append((
|
||||
util.get_async(self.cuda_stream, token_ids),
|
||||
|
@ -279,7 +316,6 @@ class ParserStepModel(Model):
|
|||
else:
|
||||
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
|
||||
|
||||
|
||||
def finish_steps(self, golds):
|
||||
# Add a padding vector to the d_tokvecs gradient, so that missing
|
||||
# values don't affect the real gradient.
|
||||
|
@ -292,14 +328,15 @@ class ParserStepModel(Model):
|
|||
ids = ids.flatten()
|
||||
d_state_features = d_state_features.reshape(
|
||||
(ids.size, d_state_features.shape[2]))
|
||||
self.ops.scatter_add(d_tokvecs, ids,
|
||||
d_state_features)
|
||||
self.ops.scatter_add(d_tokvecs, ids, d_state_features)
|
||||
# Padded -- see update()
|
||||
self.bp_tokvecs(d_tokvecs[:-1])
|
||||
return d_tokvecs
|
||||
|
||||
|
||||
NUMPY_OPS = NumpyOps()
|
||||
|
||||
|
||||
def step_forward(model: ParserStepModel, states, is_train):
|
||||
token_ids = model.get_token_ids(states)
|
||||
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
||||
|
@ -312,7 +349,7 @@ def step_forward(model: ParserStepModel, states, is_train):
|
|||
scores, get_d_vector = model.vec2scores(vector, is_train)
|
||||
else:
|
||||
scores = NumpyOps().asarray(vector)
|
||||
get_d_vector = lambda d_scores: d_scores
|
||||
get_d_vector = lambda d_scores: d_scores # no-cython-lint: E731
|
||||
# If the class is unseen, make sure its score is minimum
|
||||
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
|
||||
|
||||
|
@ -448,9 +485,11 @@ cdef class precompute_hiddens:
|
|||
|
||||
feat_weights = self.get_feat_weights()
|
||||
cdef int[:, ::1] ids = token_ids
|
||||
sum_state_features(cblas, <float*>state_vector.data,
|
||||
feat_weights, &ids[0,0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||
sum_state_features(
|
||||
cblas, <float*>state_vector.data,
|
||||
feat_weights, &ids[0, 0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP
|
||||
)
|
||||
state_vector += self.bias
|
||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||
|
||||
|
@ -475,7 +514,7 @@ cdef class precompute_hiddens:
|
|||
|
||||
def backprop_maxout(d_best):
|
||||
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
||||
|
||||
|
||||
return state_vector, backprop_maxout
|
||||
|
||||
def _relu_nonlinearity(self, state_vector):
|
||||
|
@ -489,5 +528,5 @@ cdef class precompute_hiddens:
|
|||
def backprop_relu(d_best):
|
||||
d_best *= mask
|
||||
return d_best.reshape((d_best.shape + (1,)))
|
||||
|
||||
|
||||
return state_vector, backprop_relu
|
||||
|
|
|
@ -11,7 +11,7 @@ from .typedefs cimport attr_t, hash_t
|
|||
cdef class Morphology:
|
||||
cdef readonly Pool mem
|
||||
cdef readonly StringStore strings
|
||||
cdef PreshMap tags # Keyed by hash, value is pointer to tag
|
||||
cdef PreshMap tags # Keyed by hash, value is pointer to tag
|
||||
|
||||
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
|
||||
cdef int insert(self, MorphAnalysisC tag) except -1
|
||||
|
@ -20,4 +20,8 @@ cdef class Morphology:
|
|||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
|
||||
cdef list list_features(const MorphAnalysisC* morph)
|
||||
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
|
||||
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
|
||||
cdef int get_n_by_field(
|
||||
attr_t* results,
|
||||
const MorphAnalysisC* morph,
|
||||
attr_t field,
|
||||
) nogil
|
||||
|
|
|
@ -83,10 +83,11 @@ cdef class Morphology:
|
|||
features = self.normalize_attrs(features)
|
||||
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
|
||||
# normalized UFEATS string with sorted fields and values
|
||||
norm_feats_string = self.FEATURE_SEP.join(sorted([
|
||||
self.FIELD_SEP.join([field, values])
|
||||
for field, values in string_features.items()
|
||||
]))
|
||||
norm_feats_string = self.FEATURE_SEP.join(
|
||||
sorted(
|
||||
[self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
|
||||
)
|
||||
)
|
||||
return norm_feats_string or self.EMPTY_MORPH
|
||||
|
||||
def normalize_attrs(self, attrs):
|
||||
|
@ -192,6 +193,7 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie
|
|||
n_results += 1
|
||||
return n_results
|
||||
|
||||
|
||||
def unpickle_morphology(strings, tags):
|
||||
cdef Morphology morphology = Morphology(strings)
|
||||
for tag in tags:
|
||||
|
|
|
@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
|
|||
ADV
|
||||
AUX
|
||||
CONJ
|
||||
CCONJ # U20
|
||||
CCONJ # U20
|
||||
DET
|
||||
INTJ
|
||||
NOUN
|
||||
|
|
|
@ -46,11 +46,18 @@ cdef struct EditTreeC:
|
|||
bint is_match_node
|
||||
NodeC inner
|
||||
|
||||
cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
|
||||
uint32_t prefix_tree, uint32_t suffix_tree):
|
||||
cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
|
||||
suffix_len=suffix_len, prefix_tree=prefix_tree,
|
||||
suffix_tree=suffix_tree)
|
||||
cdef inline EditTreeC edittree_new_match(
|
||||
len_t prefix_len,
|
||||
len_t suffix_len,
|
||||
uint32_t prefix_tree,
|
||||
uint32_t suffix_tree
|
||||
):
|
||||
cdef MatchNodeC match_node = MatchNodeC(
|
||||
prefix_len=prefix_len,
|
||||
suffix_len=suffix_len,
|
||||
prefix_tree=prefix_tree,
|
||||
suffix_tree=suffix_tree
|
||||
)
|
||||
cdef NodeC inner = NodeC(match_node=match_node)
|
||||
return EditTreeC(is_match_node=True, inner=inner)
|
||||
|
||||
|
|
|
@ -5,8 +5,6 @@ from libc.string cimport memset
|
|||
from libcpp.pair cimport pair
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from ...typedefs cimport hash_t
|
||||
|
||||
from ... import util
|
||||
|
@ -25,17 +23,16 @@ cdef LCS find_lcs(str source, str target):
|
|||
target (str): The second string.
|
||||
RETURNS (LCS): The spans of the longest common subsequences.
|
||||
"""
|
||||
cdef Py_ssize_t source_len = len(source)
|
||||
cdef Py_ssize_t target_len = len(target)
|
||||
cdef size_t longest_align = 0;
|
||||
cdef size_t longest_align = 0
|
||||
cdef int source_idx, target_idx
|
||||
cdef LCS lcs
|
||||
cdef Py_UCS4 source_cp, target_cp
|
||||
|
||||
memset(&lcs, 0, sizeof(lcs))
|
||||
|
||||
cdef vector[size_t] prev_aligns = vector[size_t](target_len);
|
||||
cdef vector[size_t] cur_aligns = vector[size_t](target_len);
|
||||
cdef vector[size_t] prev_aligns = vector[size_t](target_len)
|
||||
cdef vector[size_t] cur_aligns = vector[size_t](target_len)
|
||||
|
||||
for (source_idx, source_cp) in enumerate(source):
|
||||
for (target_idx, target_cp) in enumerate(target):
|
||||
|
@ -89,7 +86,7 @@ cdef class EditTrees:
|
|||
cdef LCS lcs = find_lcs(form, lemma)
|
||||
|
||||
cdef EditTreeC tree
|
||||
cdef uint32_t tree_id, prefix_tree, suffix_tree
|
||||
cdef uint32_t prefix_tree, suffix_tree
|
||||
if lcs_is_empty(lcs):
|
||||
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
||||
else:
|
||||
|
@ -108,7 +105,7 @@ cdef class EditTrees:
|
|||
return self._tree_id(tree)
|
||||
|
||||
cdef uint32_t _tree_id(self, EditTreeC tree):
|
||||
# If this tree has been constructed before, return its identifier.
|
||||
# If this tree has been constructed before, return its identifier.
|
||||
cdef hash_t hash = edittree_hash(tree)
|
||||
cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
|
||||
if iter != self.map.end():
|
||||
|
@ -289,6 +286,7 @@ def _tree2dict(tree):
|
|||
tree = tree["inner"]["subst_node"]
|
||||
return(dict(tree))
|
||||
|
||||
|
||||
def _dict2tree(tree):
|
||||
errors = validate_edit_tree(tree)
|
||||
if errors:
|
||||
|
|
|
@ -1,17 +1,14 @@
|
|||
# cython: infer_types=True
|
||||
# cython: profile=True
|
||||
cimport numpy as np
|
||||
|
||||
import numpy
|
||||
|
||||
from cpython.ref cimport Py_XDECREF, PyObject
|
||||
from thinc.extra.search cimport Beam
|
||||
|
||||
from thinc.extra.search import MaxViolation
|
||||
|
||||
from thinc.extra.search cimport MaxViolation
|
||||
|
||||
from ...typedefs cimport class_t, hash_t
|
||||
from ...typedefs cimport class_t
|
||||
from .transition_system cimport Transition, TransitionSystem
|
||||
|
||||
from ...errors import Errors
|
||||
|
@ -146,7 +143,6 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_de
|
|||
cdef MaxViolation violn
|
||||
pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
|
||||
gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
|
||||
cdef StateClass state
|
||||
beam_maps = []
|
||||
backprops = []
|
||||
violns = [MaxViolation() for _ in range(len(states))]
|
||||
|
|
|
@ -277,7 +277,6 @@ cdef cppclass StateC:
|
|||
|
||||
return n
|
||||
|
||||
|
||||
int n_L(int head) nogil const:
|
||||
return n_arcs(this._left_arcs, head)
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ from ...strings cimport hash_string
|
|||
from ...structs cimport TokenC
|
||||
from ...tokens.doc cimport Doc, set_children_from_heads
|
||||
from ...tokens.token cimport MISSING_DEP
|
||||
from ...typedefs cimport attr_t, hash_t
|
||||
from ...typedefs cimport attr_t
|
||||
|
||||
from ...training import split_bilu_label
|
||||
|
||||
|
@ -68,8 +68,9 @@ cdef struct GoldParseStateC:
|
|||
weight_t pop_cost
|
||||
|
||||
|
||||
cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
|
||||
heads, labels, sent_starts) except *:
|
||||
cdef GoldParseStateC create_gold_state(
|
||||
Pool mem, const StateC* state, heads, labels, sent_starts
|
||||
) except *:
|
||||
cdef GoldParseStateC gs
|
||||
gs.length = len(heads)
|
||||
gs.stride = 1
|
||||
|
@ -82,7 +83,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
|
|||
gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
|
||||
|
||||
for i, is_sent_start in enumerate(sent_starts):
|
||||
if is_sent_start == True:
|
||||
if is_sent_start is True:
|
||||
gs.state_bits[i] = set_state_flag(
|
||||
gs.state_bits[i],
|
||||
IS_SENT_START,
|
||||
|
@ -210,6 +211,7 @@ cdef class ArcEagerGold:
|
|||
def update(self, StateClass stcls):
|
||||
update_gold_state(&self.c, stcls.c)
|
||||
|
||||
|
||||
def _get_aligned_sent_starts(example):
|
||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||
If the reference has no sentence starts, return a list of None values.
|
||||
|
@ -524,7 +526,6 @@ cdef class Break:
|
|||
"""
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int i
|
||||
if st.buffer_length() < 2:
|
||||
return False
|
||||
elif st.B(1) != st.B(0) + 1:
|
||||
|
@ -556,8 +557,8 @@ cdef class Break:
|
|||
cost -= 1
|
||||
if gold.heads[si] == b0:
|
||||
cost -= 1
|
||||
if not is_sent_start(gold, state.B(1)) \
|
||||
and not is_sent_start_unknown(gold, state.B(1)):
|
||||
if not is_sent_start(gold, state.B(1)) and\
|
||||
not is_sent_start_unknown(gold, state.B(1)):
|
||||
cost += 1
|
||||
return cost
|
||||
|
||||
|
@ -803,7 +804,6 @@ cdef class ArcEager(TransitionSystem):
|
|||
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
|
||||
cdef ArcEagerGold gold_ = gold
|
||||
gold_state = gold_.c
|
||||
n_gold = 0
|
||||
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
||||
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
||||
else:
|
||||
|
@ -875,7 +875,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
print("Gold")
|
||||
for token in example.y:
|
||||
print(token.i, token.text, token.dep_, token.head.text)
|
||||
aligned_heads, aligned_labels = example.get_aligned_parse()
|
||||
aligned_heads, _aligned_labels = example.get_aligned_parse()
|
||||
print("Aligned heads")
|
||||
for i, head in enumerate(aligned_heads):
|
||||
print(example.x[i], example.x[head] if head is not None else "__")
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
import os
|
||||
import random
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdint cimport int32_t
|
||||
|
||||
|
@ -14,7 +11,7 @@ from ...tokens.span import Span
|
|||
|
||||
from ...attrs cimport IS_SPACE
|
||||
from ...lexeme cimport Lexeme
|
||||
from ...structs cimport SpanC, TokenC
|
||||
from ...structs cimport SpanC
|
||||
from ...tokens.span cimport Span
|
||||
from ...typedefs cimport attr_t, weight_t
|
||||
|
||||
|
@ -141,11 +138,10 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
OUT: Counter()
|
||||
}
|
||||
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
|
||||
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
|
||||
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
|
||||
for entity_type in kwargs.get('entity_types', []):
|
||||
for action in (BEGIN, IN, LAST, UNIT):
|
||||
actions[action][entity_type] = 1
|
||||
moves = ('M', 'B', 'I', 'L', 'U')
|
||||
for example in kwargs.get('examples', []):
|
||||
for token in example.y:
|
||||
ent_type = token.ent_type_
|
||||
|
@ -164,7 +160,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if token.ent_type:
|
||||
labels.add(token.ent_type_)
|
||||
return labels
|
||||
|
||||
|
||||
def move_name(self, int move, attr_t label):
|
||||
if move == OUT:
|
||||
return 'O'
|
||||
|
@ -325,7 +321,6 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
raise TypeError(Errors.E909.format(name="BiluoGold"))
|
||||
cdef BiluoGold gold_ = gold
|
||||
gold_state = gold_.c
|
||||
n_gold = 0
|
||||
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
||||
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
||||
else:
|
||||
|
@ -486,10 +481,8 @@ cdef class In:
|
|||
@staticmethod
|
||||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||
gold = <GoldNERStateC*>_gold
|
||||
move = IN
|
||||
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||
|
||||
if g_act == MISSING:
|
||||
|
@ -549,12 +542,10 @@ cdef class Last:
|
|||
@staticmethod
|
||||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||
gold = <GoldNERStateC*>_gold
|
||||
move = LAST
|
||||
b0 = s.B(0)
|
||||
ent_start = s.E(0)
|
||||
|
||||
cdef int g_act = gold.ner[b0].move
|
||||
cdef attr_t g_tag = gold.ner[b0].label
|
||||
|
||||
cdef int cost = 0
|
||||
|
||||
|
@ -650,7 +641,6 @@ cdef class Unit:
|
|||
cost += 1
|
||||
break
|
||||
return cost
|
||||
|
||||
|
||||
|
||||
cdef class Out:
|
||||
|
@ -675,7 +665,6 @@ cdef class Out:
|
|||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||
gold = <GoldNERStateC*>_gold
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
cdef weight_t cost = 0
|
||||
if g_act == MISSING:
|
||||
pass
|
||||
|
|
|
@ -125,14 +125,17 @@ def decompose(label):
|
|||
def is_decorated(label):
|
||||
return DELIMITER in label
|
||||
|
||||
|
||||
def count_decorated_labels(gold_data):
|
||||
freqs = {}
|
||||
for example in gold_data:
|
||||
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
|
||||
example.get_aligned("DEP"))
|
||||
# set the label to ROOT for each root dependent
|
||||
deco_deps = ['ROOT' if head == i else deco_deps[i]
|
||||
for i, head in enumerate(proj_heads)]
|
||||
deco_deps = [
|
||||
'ROOT' if head == i else deco_deps[i]
|
||||
for i, head in enumerate(proj_heads)
|
||||
]
|
||||
# count label frequencies
|
||||
for label in deco_deps:
|
||||
if is_decorated(label):
|
||||
|
@ -160,9 +163,9 @@ def projectivize(heads, labels):
|
|||
|
||||
|
||||
cdef vector[int] _heads_to_c(heads):
|
||||
cdef vector[int] c_heads;
|
||||
cdef vector[int] c_heads
|
||||
for head in heads:
|
||||
if head == None:
|
||||
if head is None:
|
||||
c_heads.push_back(-1)
|
||||
else:
|
||||
assert head < len(heads)
|
||||
|
@ -199,6 +202,7 @@ def _decorate(heads, proj_heads, labels):
|
|||
deco_labels.append(labels[tokenid])
|
||||
return deco_labels
|
||||
|
||||
|
||||
def get_smallest_nonproj_arc_slow(heads):
|
||||
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||
return _get_smallest_nonproj_arc(c_heads)
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
# cython: infer_types=True
|
||||
import numpy
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from ...tokens.doc cimport Doc
|
||||
|
@ -38,11 +36,11 @@ cdef class StateClass:
|
|||
cdef vector[ArcC] arcs
|
||||
self.c.get_arcs(&arcs)
|
||||
return list(arcs)
|
||||
#py_arcs = []
|
||||
#for arc in arcs:
|
||||
# if arc.head != -1 and arc.child != -1:
|
||||
# py_arcs.append((arc.head, arc.child, arc.label))
|
||||
#return arcs
|
||||
# py_arcs = []
|
||||
# for arc in arcs:
|
||||
# if arc.head != -1 and arc.child != -1:
|
||||
# py_arcs.append((arc.head, arc.child, arc.label))
|
||||
# return arcs
|
||||
|
||||
def add_arc(self, int head, int child, int label):
|
||||
self.c.add_arc(head, child, label)
|
||||
|
@ -52,10 +50,10 @@ cdef class StateClass:
|
|||
|
||||
def H(self, int child):
|
||||
return self.c.H(child)
|
||||
|
||||
|
||||
def L(self, int head, int idx):
|
||||
return self.c.L(head, idx)
|
||||
|
||||
|
||||
def R(self, int head, int idx):
|
||||
return self.c.R(head, idx)
|
||||
|
||||
|
@ -98,7 +96,7 @@ cdef class StateClass:
|
|||
|
||||
def H(self, int i):
|
||||
return self.c.H(i)
|
||||
|
||||
|
||||
def E(self, int i):
|
||||
return self.c.E(i)
|
||||
|
||||
|
@ -116,7 +114,7 @@ cdef class StateClass:
|
|||
|
||||
def H_(self, int i):
|
||||
return self.doc[self.c.H(i)]
|
||||
|
||||
|
||||
def E_(self, int i):
|
||||
return self.doc[self.c.E(i)]
|
||||
|
||||
|
@ -125,7 +123,7 @@ cdef class StateClass:
|
|||
|
||||
def R_(self, int i, int idx):
|
||||
return self.doc[self.c.R(i, idx)]
|
||||
|
||||
|
||||
def empty(self):
|
||||
return self.c.empty()
|
||||
|
||||
|
@ -134,7 +132,7 @@ cdef class StateClass:
|
|||
|
||||
def at_break(self):
|
||||
return False
|
||||
#return self.c.at_break()
|
||||
# return self.c.at_break()
|
||||
|
||||
def has_head(self, int i):
|
||||
return self.c.has_head(i)
|
||||
|
|
|
@ -20,11 +20,15 @@ cdef struct Transition:
|
|||
int (*do)(StateC* state, attr_t label) nogil
|
||||
|
||||
|
||||
ctypedef weight_t (*get_cost_func_t)(const StateC* state, const void* gold,
|
||||
attr_tlabel) nogil
|
||||
ctypedef weight_t (*move_cost_func_t)(const StateC* state, const void* gold) nogil
|
||||
ctypedef weight_t (*label_cost_func_t)(const StateC* state, const void*
|
||||
gold, attr_t label) nogil
|
||||
ctypedef weight_t (*get_cost_func_t)(
|
||||
const StateC* state, const void* gold, attr_tlabel
|
||||
) nogil
|
||||
ctypedef weight_t (*move_cost_func_t)(
|
||||
const StateC* state, const void* gold
|
||||
) nogil
|
||||
ctypedef weight_t (*label_cost_func_t)(
|
||||
const StateC* state, const void* gold, attr_t label
|
||||
) nogil
|
||||
|
||||
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
||||
|
||||
|
|
|
@ -8,9 +8,7 @@ from collections import Counter
|
|||
import srsly
|
||||
|
||||
from ...structs cimport TokenC
|
||||
from ...tokens.doc cimport Doc
|
||||
from ...typedefs cimport attr_t, weight_t
|
||||
from . cimport _beam_utils
|
||||
from .stateclass cimport StateClass
|
||||
|
||||
from ... import util
|
||||
|
@ -231,7 +229,6 @@ cdef class TransitionSystem:
|
|||
return self
|
||||
|
||||
def to_bytes(self, exclude=tuple()):
|
||||
transitions = []
|
||||
serializers = {
|
||||
'moves': lambda: srsly.json_dumps(self.labels),
|
||||
'strings': lambda: self.strings.to_bytes(),
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from collections import defaultdict
|
||||
from typing import Callable, Iterable, Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
from thinc.api import Config, Model
|
||||
|
||||
|
@ -124,6 +124,7 @@ def make_parser(
|
|||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"beam_parser",
|
||||
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
from itertools import islice
|
||||
from typing import Callable, Dict, Optional, Union
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
|
||||
from ..morphology cimport Morphology
|
||||
|
@ -14,10 +13,8 @@ from ..errors import Errors
|
|||
from ..language import Language
|
||||
from ..parts_of_speech import IDS as POS_IDS
|
||||
from ..scorer import Scorer
|
||||
from ..symbols import POS
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
from .pipe import deserialize_config
|
||||
from .tagger import Tagger
|
||||
|
||||
# See #9050
|
||||
|
@ -76,8 +73,11 @@ def morphologizer_score(examples, **kwargs):
|
|||
results = {}
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||
results.update(Scorer.score_token_attr_per_feat(examples,
|
||||
"morph", getter=morph_key_getter, **kwargs))
|
||||
results.update(
|
||||
Scorer.score_token_attr_per_feat(
|
||||
examples, "morph", getter=morph_key_getter, **kwargs
|
||||
)
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
|
@ -233,7 +233,6 @@ class Morphologizer(Tagger):
|
|||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef Vocab vocab = self.vocab
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
cdef bint extend = self.cfg["extend"]
|
||||
labels = self.labels
|
||||
|
|
|
@ -4,13 +4,10 @@ from typing import Optional
|
|||
import numpy
|
||||
from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from ..attrs import ID, POS
|
||||
from ..attrs import ID
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..training import validate_examples
|
||||
from ._parser_internals import nonproj
|
||||
from .tagger import Tagger
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
||||
|
@ -103,10 +100,9 @@ class MultitaskObjective(Tagger):
|
|||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
||||
guesses = scores.argmax(axis=1)
|
||||
docs = [eg.predicted for eg in examples]
|
||||
for i, eg in enumerate(examples):
|
||||
# Handles alignment for tokenization differences
|
||||
doc_annots = eg.get_aligned() # TODO
|
||||
_doc_annots = eg.get_aligned() # TODO
|
||||
for j in range(len(eg.predicted)):
|
||||
tok_annots = {key: values[j] for key, values in tok_annots.items()}
|
||||
label = self.make_label(j, tok_annots)
|
||||
|
@ -206,7 +202,6 @@ class ClozeMultitask(TrainablePipe):
|
|||
losses[self.name] = 0.
|
||||
set_dropout_rate(self.model, drop)
|
||||
validate_examples(examples, "ClozeMultitask.rehearse")
|
||||
docs = [eg.predicted for eg in examples]
|
||||
predictions, bp_predictions = self.model.begin_update()
|
||||
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
||||
bp_predictions(d_predictions)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from collections import defaultdict
|
||||
from typing import Callable, Iterable, Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
from thinc.api import Config, Model
|
||||
|
||||
|
@ -10,7 +10,7 @@ from ._parser_internals.ner cimport BiluoPushDown
|
|||
from .transition_parser cimport Parser
|
||||
|
||||
from ..language import Language
|
||||
from ..scorer import PRFScore, get_ner_prf
|
||||
from ..scorer import get_ner_prf
|
||||
from ..training import remove_bilu_prefix
|
||||
from ..util import registry
|
||||
|
||||
|
@ -100,6 +100,7 @@ def make_ner(
|
|||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"beam_ner",
|
||||
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
import warnings
|
||||
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
|
||||
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
|
||||
|
||||
import srsly
|
||||
|
||||
|
@ -40,7 +40,7 @@ cdef class Pipe:
|
|||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
|
||||
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
applied to the Doc.
|
||||
|
@ -59,7 +59,7 @@ cdef class Pipe:
|
|||
except Exception as e:
|
||||
error_handler(self.name, self, [doc], e)
|
||||
|
||||
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
|
||||
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
|
||||
"""Initialize the pipe. For non-trainable components, this method
|
||||
is optional. For trainable components, which should inherit
|
||||
from the subclass TrainablePipe, the provided data examples
|
||||
|
|
|
@ -7,13 +7,13 @@ from ..tokens.doc cimport Doc
|
|||
|
||||
from .. import util
|
||||
from ..language import Language
|
||||
from ..scorer import Scorer
|
||||
from .pipe import Pipe
|
||||
from .senter import senter_score
|
||||
|
||||
# see #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"sentencizer",
|
||||
assigns=["token.is_sent_start", "doc.sents"],
|
||||
|
@ -36,17 +36,19 @@ class Sentencizer(Pipe):
|
|||
DOCS: https://spacy.io/api/sentencizer
|
||||
"""
|
||||
|
||||
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
||||
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
|
||||
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
|
||||
'‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
|
||||
'꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
|
||||
'﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
|
||||
'𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
|
||||
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
|
||||
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
|
||||
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
||||
'。', '。']
|
||||
default_punct_chars = [
|
||||
'!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
||||
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
|
||||
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
|
||||
'‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
|
||||
'꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
|
||||
'﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
|
||||
'𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
|
||||
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
|
||||
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
|
||||
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
||||
'。', '。'
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -128,7 +130,6 @@ class Sentencizer(Pipe):
|
|||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef int idx = 0
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
|
@ -169,7 +170,6 @@ class Sentencizer(Pipe):
|
|||
path = path.with_suffix(".json")
|
||||
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
||||
|
||||
|
||||
def from_disk(self, path, *, exclude=tuple()):
|
||||
"""Load the sentencizer from disk.
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
from itertools import islice
|
||||
from typing import Callable, Optional
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
|
|
@ -1,26 +1,18 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
import warnings
|
||||
from itertools import islice
|
||||
from typing import Callable, Optional
|
||||
|
||||
import numpy
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ..morphology cimport Morphology
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..vocab cimport Vocab
|
||||
|
||||
from .. import util
|
||||
from ..attrs import ID, POS
|
||||
from ..errors import Errors, Warnings
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..parts_of_speech import X
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
from .pipe import deserialize_config
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
||||
# See #9050
|
||||
|
@ -169,7 +161,6 @@ class Tagger(TrainablePipe):
|
|||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef Vocab vocab = self.vocab
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
labels = self.labels
|
||||
for i, doc in enumerate(docs):
|
||||
|
|
|
@ -55,7 +55,7 @@ cdef class TrainablePipe(Pipe):
|
|||
except Exception as e:
|
||||
error_handler(self.name, self, [doc], e)
|
||||
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
applied to the Doc.
|
||||
|
@ -102,9 +102,9 @@ cdef class TrainablePipe(Pipe):
|
|||
def update(self,
|
||||
examples: Iterable["Example"],
|
||||
*,
|
||||
drop: float=0.0,
|
||||
sgd: Optimizer=None,
|
||||
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
|
||||
drop: float = 0.0,
|
||||
sgd: Optimizer = None,
|
||||
losses: Optional[Dict[str, float]] = None) -> Dict[str, float]:
|
||||
"""Learn from a batch of documents and gold-standard information,
|
||||
updating the pipe's model. Delegates to predict and get_loss.
|
||||
|
||||
|
@ -138,8 +138,8 @@ cdef class TrainablePipe(Pipe):
|
|||
def rehearse(self,
|
||||
examples: Iterable[Example],
|
||||
*,
|
||||
sgd: Optimizer=None,
|
||||
losses: Dict[str, float]=None,
|
||||
sgd: Optimizer = None,
|
||||
losses: Dict[str, float] = None,
|
||||
**config) -> Dict[str, float]:
|
||||
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
||||
teach the current model to make predictions similar to an initial model,
|
||||
|
@ -177,7 +177,7 @@ cdef class TrainablePipe(Pipe):
|
|||
"""
|
||||
return util.create_default_optimizer()
|
||||
|
||||
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
|
||||
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
This method needs to be implemented by each TrainablePipe component,
|
||||
ensuring the internal model (if available) is initialized properly
|
||||
|
|
|
@ -13,8 +13,18 @@ cdef class Parser(TrainablePipe):
|
|||
cdef readonly TransitionSystem moves
|
||||
cdef public object _multitasks
|
||||
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil
|
||||
cdef void _parseC(
|
||||
self,
|
||||
CBlas cblas,
|
||||
StateC** states,
|
||||
WeightsC weights,
|
||||
SizesC sizes
|
||||
) nogil
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil
|
||||
cdef void c_transition_batch(
|
||||
self,
|
||||
StateC** states,
|
||||
const float* scores,
|
||||
int nr_class,
|
||||
int batch_size
|
||||
) nogil
|
||||
|
|
|
@ -7,20 +7,15 @@ from cymem.cymem cimport Pool
|
|||
from itertools import islice
|
||||
|
||||
from libc.stdlib cimport calloc, free
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.string cimport memset
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
import random
|
||||
|
||||
import srsly
|
||||
from thinc.api import CupyOps, NumpyOps, get_ops, set_dropout_rate
|
||||
|
||||
from thinc.extra.search cimport Beam
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy
|
||||
import numpy.random
|
||||
import srsly
|
||||
from thinc.api import CupyOps, NumpyOps, set_dropout_rate
|
||||
|
||||
from ..ml.parser_model cimport (
|
||||
ActivationsC,
|
||||
|
@ -42,7 +37,7 @@ from .trainable_pipe import TrainablePipe
|
|||
from ._parser_internals cimport _beam_utils
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors, Warnings
|
||||
from ..errors import Errors
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ._parser_internals import _beam_utils
|
||||
|
||||
|
@ -258,7 +253,6 @@ cdef class Parser(TrainablePipe):
|
|||
except Exception as e:
|
||||
error_handler(self.name, self, batch_in_order, e)
|
||||
|
||||
|
||||
def predict(self, docs):
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
|
@ -300,8 +294,6 @@ cdef class Parser(TrainablePipe):
|
|||
return batch
|
||||
|
||||
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
|
||||
cdef Beam beam
|
||||
cdef Doc doc
|
||||
self._ensure_labels_are_added(docs)
|
||||
batch = _beam_utils.BeamBatch(
|
||||
self.moves,
|
||||
|
@ -321,16 +313,18 @@ cdef class Parser(TrainablePipe):
|
|||
del model
|
||||
return list(batch)
|
||||
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil:
|
||||
cdef int i, j
|
||||
cdef void _parseC(
|
||||
self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
|
||||
) nogil:
|
||||
cdef int i
|
||||
cdef vector[StateC*] unfinished
|
||||
cdef ActivationsC activations = alloc_activations(sizes)
|
||||
while sizes.states >= 1:
|
||||
predict_states(cblas, &activations, states, &weights, sizes)
|
||||
# Validate actions, argmax, take action.
|
||||
self.c_transition_batch(states,
|
||||
activations.scores, sizes.classes, sizes.states)
|
||||
self.c_transition_batch(
|
||||
states, activations.scores, sizes.classes, sizes.states
|
||||
)
|
||||
for i in range(sizes.states):
|
||||
if not states[i].is_final():
|
||||
unfinished.push_back(states[i])
|
||||
|
@ -342,7 +336,6 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
def set_annotations(self, docs, states_or_beams):
|
||||
cdef StateClass state
|
||||
cdef Beam beam
|
||||
cdef Doc doc
|
||||
states = _beam_utils.collect_states(states_or_beams, docs)
|
||||
for i, (state, doc) in enumerate(zip(states, docs)):
|
||||
|
@ -359,8 +352,13 @@ cdef class Parser(TrainablePipe):
|
|||
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
cdef void c_transition_batch(
|
||||
self,
|
||||
StateC** states,
|
||||
const float* scores,
|
||||
int nr_class,
|
||||
int batch_size
|
||||
) nogil:
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
with gil:
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
|
@ -380,7 +378,6 @@ cdef class Parser(TrainablePipe):
|
|||
free(is_valid)
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
cdef StateClass state
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.)
|
||||
|
@ -419,8 +416,7 @@ cdef class Parser(TrainablePipe):
|
|||
if not states:
|
||||
return losses
|
||||
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
|
||||
|
||||
all_states = list(states)
|
||||
|
||||
states_golds = list(zip(states, golds))
|
||||
n_moves = 0
|
||||
while states_golds:
|
||||
|
@ -500,8 +496,16 @@ cdef class Parser(TrainablePipe):
|
|||
del tutor
|
||||
return losses
|
||||
|
||||
def update_beam(self, examples, *, beam_width,
|
||||
drop=0., sgd=None, losses=None, beam_density=0.0):
|
||||
def update_beam(
|
||||
self,
|
||||
examples,
|
||||
*,
|
||||
beam_width,
|
||||
drop=0.,
|
||||
sgd=None,
|
||||
losses=None,
|
||||
beam_density=0.0
|
||||
):
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
return losses
|
||||
|
@ -531,8 +535,9 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
|
||||
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
|
||||
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
||||
dtype='f', order='C')
|
||||
cdef np.ndarray d_scores = numpy.zeros(
|
||||
(len(states), self.moves.n_moves), dtype='f', order='C'
|
||||
)
|
||||
c_d_scores = <float*>d_scores.data
|
||||
unseen_classes = self.model.attrs["unseen_classes"]
|
||||
for i, (state, gold) in enumerate(zip(states, golds)):
|
||||
|
@ -542,8 +547,9 @@ cdef class Parser(TrainablePipe):
|
|||
for j in range(self.moves.n_moves):
|
||||
if costs[j] <= 0.0 and j in unseen_classes:
|
||||
unseen_classes.remove(j)
|
||||
cpu_log_loss(c_d_scores,
|
||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||
cpu_log_loss(
|
||||
c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
|
||||
)
|
||||
c_d_scores += d_scores.shape[1]
|
||||
# Note that we don't normalize this. See comment in update() for why.
|
||||
if losses is not None:
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
cimport cython
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.string cimport memcpy
|
||||
from libcpp.set cimport set
|
||||
from murmurhash.mrmr cimport hash32, hash64
|
||||
|
||||
import srsly
|
||||
|
@ -20,9 +19,10 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
|
|||
try:
|
||||
out_hash[0] = key
|
||||
return True
|
||||
except:
|
||||
except: # no-cython-lint
|
||||
return False
|
||||
|
||||
|
||||
def get_string_id(key):
|
||||
"""Get a string ID, handling the reserved symbols correctly. If the key is
|
||||
already an ID, return it.
|
||||
|
@ -87,7 +87,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
|
|||
cdef int n_length_bytes
|
||||
cdef int i
|
||||
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
|
||||
cdef uint32_t ulength = length
|
||||
if length < sizeof(string.s):
|
||||
string.s[0] = <unsigned char>length
|
||||
memcpy(&string.s[1], chars, length)
|
||||
|
|
|
@ -52,7 +52,7 @@ cdef struct TokenC:
|
|||
|
||||
int sent_start
|
||||
int ent_iob
|
||||
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||
attr_t ent_kb_id
|
||||
hash_t ent_id
|
||||
|
||||
|
|
|
@ -92,7 +92,7 @@ cdef enum symbol_t:
|
|||
ADV
|
||||
AUX
|
||||
CONJ
|
||||
CCONJ # U20
|
||||
CCONJ # U20
|
||||
DET
|
||||
INTJ
|
||||
NOUN
|
||||
|
@ -418,7 +418,7 @@ cdef enum symbol_t:
|
|||
ccomp
|
||||
complm
|
||||
conj
|
||||
cop # U20
|
||||
cop # U20
|
||||
csubj
|
||||
csubjpass
|
||||
dep
|
||||
|
@ -441,8 +441,8 @@ cdef enum symbol_t:
|
|||
num
|
||||
number
|
||||
oprd
|
||||
obj # U20
|
||||
obl # U20
|
||||
obj # U20
|
||||
obl # U20
|
||||
parataxis
|
||||
partmod
|
||||
pcomp
|
||||
|
|
|
@ -96,7 +96,7 @@ IDS = {
|
|||
"ADV": ADV,
|
||||
"AUX": AUX,
|
||||
"CONJ": CONJ,
|
||||
"CCONJ": CCONJ, # U20
|
||||
"CCONJ": CCONJ, # U20
|
||||
"DET": DET,
|
||||
"INTJ": INTJ,
|
||||
"NOUN": NOUN,
|
||||
|
@ -421,7 +421,7 @@ IDS = {
|
|||
"ccomp": ccomp,
|
||||
"complm": complm,
|
||||
"conj": conj,
|
||||
"cop": cop, # U20
|
||||
"cop": cop, # U20
|
||||
"csubj": csubj,
|
||||
"csubjpass": csubjpass,
|
||||
"dep": dep,
|
||||
|
@ -444,8 +444,8 @@ IDS = {
|
|||
"num": num,
|
||||
"number": number,
|
||||
"oprd": oprd,
|
||||
"obj": obj, # U20
|
||||
"obl": obl, # U20
|
||||
"obj": obj, # U20
|
||||
"obl": obl, # U20
|
||||
"parataxis": parataxis,
|
||||
"partmod": partmod,
|
||||
"pcomp": pcomp,
|
||||
|
|
|
@ -52,7 +52,8 @@ TEST_PATTERNS = [
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]]
|
||||
"pattern",
|
||||
[[{"XX": "y"}], [{"LENGTH": "2"}], [{"TEXT": {"IN": 5}}], [{"text": {"in": 6}}]],
|
||||
)
|
||||
def test_matcher_pattern_validation(en_vocab, pattern):
|
||||
matcher = Matcher(en_vocab, validate=True)
|
||||
|
|
|
@ -12,6 +12,7 @@ def test_build_dependencies():
|
|||
"flake8",
|
||||
"hypothesis",
|
||||
"pre-commit",
|
||||
"cython-lint",
|
||||
"black",
|
||||
"isort",
|
||||
"mypy",
|
||||
|
|
|
@ -377,3 +377,22 @@ def test_displacy_manual_sorted_entities():
|
|||
|
||||
html = displacy.render(doc, style="ent", manual=True)
|
||||
assert html.find("FIRST") < html.find("SECOND")
|
||||
|
||||
|
||||
@pytest.mark.issue(12816)
def test_issue12816(en_vocab) -> None:
    """Test that displaCy's span visualizer escapes annotated HTML tags correctly.

    The doc holds one token whose text is a literal HTML tag ("<TEST>").
    The rendered markup is checked twice: first while that token is not
    covered by any span, then again after a span has been added over it.
    In both cases the tag must show up in its entity-escaped form.
    """
    # Create a doc containing an annotated word and an unannotated HTML tag
    doc = Doc(en_vocab, words=["test", "<TEST>"])
    doc.spans["sc"] = [Span(doc, 0, 1, label="test")]

    # Verify that the HTML tag is escaped when unannotated
    html = displacy.render(doc, style="span")
    assert "&lt;TEST&gt;" in html

    # Annotate the HTML tag
    doc.spans["sc"].append(Span(doc, 1, 2, label="test"))

    # Verify that the HTML tag is still escaped
    html = displacy.render(doc, style="span")
    assert "&lt;TEST&gt;" in html
|
||||
|
|
|
@ -31,24 +31,58 @@ cdef class Tokenizer:
|
|||
|
||||
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
|
||||
cdef int _apply_special_cases(self, Doc doc) except -1
|
||||
cdef void _filter_special_spans(self, vector[SpanC] &original,
|
||||
vector[SpanC] &filtered, int doc_len) nogil
|
||||
cdef object _prepare_special_spans(self, Doc doc,
|
||||
vector[SpanC] &filtered)
|
||||
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
|
||||
object span_data)
|
||||
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
|
||||
int* has_special,
|
||||
bint with_special_cases) except -1
|
||||
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
|
||||
int* has_special, bint with_special_cases) except -1
|
||||
cdef str _split_affixes(self, Pool mem, str string,
|
||||
vector[LexemeC*] *prefixes,
|
||||
vector[LexemeC*] *suffixes, int* has_special,
|
||||
bint with_special_cases)
|
||||
cdef int _attach_tokens(self, Doc tokens, str string,
|
||||
vector[LexemeC*] *prefixes,
|
||||
vector[LexemeC*] *suffixes, int* has_special,
|
||||
bint with_special_cases) except -1
|
||||
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
|
||||
int* has_special, int n) except -1
|
||||
cdef void _filter_special_spans(
|
||||
self,
|
||||
vector[SpanC] &original,
|
||||
vector[SpanC] &filtered,
|
||||
int doc_len,
|
||||
) nogil
|
||||
cdef object _prepare_special_spans(
|
||||
self,
|
||||
Doc doc,
|
||||
vector[SpanC] &filtered,
|
||||
)
|
||||
cdef int _retokenize_special_spans(
|
||||
self,
|
||||
Doc doc,
|
||||
TokenC* tokens,
|
||||
object span_data,
|
||||
)
|
||||
cdef int _try_specials_and_cache(
|
||||
self,
|
||||
hash_t key,
|
||||
Doc tokens,
|
||||
int* has_special,
|
||||
bint with_special_cases,
|
||||
) except -1
|
||||
cdef int _tokenize(
|
||||
self,
|
||||
Doc tokens,
|
||||
str span,
|
||||
hash_t key,
|
||||
int* has_special,
|
||||
bint with_special_cases,
|
||||
) except -1
|
||||
cdef str _split_affixes(
|
||||
self,
|
||||
Pool mem,
|
||||
str string,
|
||||
vector[LexemeC*] *prefixes,
|
||||
vector[LexemeC*] *suffixes, int* has_special,
|
||||
bint with_special_cases,
|
||||
)
|
||||
cdef int _attach_tokens(
|
||||
self,
|
||||
Doc tokens,
|
||||
str string,
|
||||
vector[LexemeC*] *prefixes,
|
||||
vector[LexemeC*] *suffixes, int* has_special,
|
||||
bint with_special_cases,
|
||||
) except -1
|
||||
cdef int _save_cached(
|
||||
self,
|
||||
const TokenC* tokens,
|
||||
hash_t key,
|
||||
int* has_special,
|
||||
int n,
|
||||
) except -1
|
||||
|
|
|
@ -8,20 +8,18 @@ from libcpp.set cimport set as stdset
|
|||
from preshed.maps cimport PreshMap
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .strings cimport hash_string
|
||||
from .tokens.doc cimport Doc
|
||||
|
||||
from . import util
|
||||
from .attrs import intify_attrs
|
||||
from .errors import Errors, Warnings
|
||||
from .errors import Errors
|
||||
from .scorer import Scorer
|
||||
from .symbols import NORM, ORTH
|
||||
from .tokens import Span
|
||||
from .training import validate_examples
|
||||
from .util import get_words_and_spaces, registry
|
||||
from .util import get_words_and_spaces
|
||||
|
||||
|
||||
cdef class Tokenizer:
|
||||
|
@ -324,7 +322,7 @@ cdef class Tokenizer:
|
|||
cdef int span_start
|
||||
cdef int span_end
|
||||
while i < doc.length:
|
||||
if not i in span_data:
|
||||
if i not in span_data:
|
||||
tokens[i + offset] = doc.c[i]
|
||||
i += 1
|
||||
else:
|
||||
|
@ -395,12 +393,15 @@ cdef class Tokenizer:
|
|||
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
|
||||
tokens.length - orig_size)
|
||||
|
||||
cdef str _split_affixes(self, Pool mem, str string,
|
||||
vector[const LexemeC*] *prefixes,
|
||||
vector[const LexemeC*] *suffixes,
|
||||
int* has_special,
|
||||
bint with_special_cases):
|
||||
cdef size_t i
|
||||
cdef str _split_affixes(
|
||||
self,
|
||||
Pool mem,
|
||||
str string,
|
||||
vector[const LexemeC*] *prefixes,
|
||||
vector[const LexemeC*] *suffixes,
|
||||
int* has_special,
|
||||
bint with_special_cases
|
||||
):
|
||||
cdef str prefix
|
||||
cdef str suffix
|
||||
cdef str minus_pre
|
||||
|
@ -445,10 +446,6 @@ cdef class Tokenizer:
|
|||
vector[const LexemeC*] *suffixes,
|
||||
int* has_special,
|
||||
bint with_special_cases) except -1:
|
||||
cdef bint specials_hit = 0
|
||||
cdef bint cache_hit = 0
|
||||
cdef int split, end
|
||||
cdef const LexemeC* const* lexemes
|
||||
cdef const LexemeC* lexeme
|
||||
cdef str span
|
||||
cdef int i
|
||||
|
@ -458,9 +455,11 @@ cdef class Tokenizer:
|
|||
if string:
|
||||
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
|
||||
pass
|
||||
elif (self.token_match and self.token_match(string)) or \
|
||||
(self.url_match and \
|
||||
self.url_match(string)):
|
||||
elif (
|
||||
(self.token_match and self.token_match(string)) or
|
||||
(self.url_match and self.url_match(string))
|
||||
):
|
||||
|
||||
# We're always saying 'no' to spaces here -- the caller will
|
||||
# fix up the outermost one, with reference to the original.
|
||||
# See Issue #859
|
||||
|
@ -821,7 +820,7 @@ cdef class Tokenizer:
|
|||
self.infix_finditer = None
|
||||
self.token_match = None
|
||||
self.url_match = None
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
util.from_bytes(bytes_data, deserializers, exclude)
|
||||
if "prefix_search" in data and isinstance(data["prefix_search"], str):
|
||||
self.prefix_search = re.compile(data["prefix_search"]).search
|
||||
if "suffix_search" in data and isinstance(data["suffix_search"], str):
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# cython: infer_types=True, bounds_check=False, profile=True
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdlib cimport free, malloc
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.string cimport memset
|
||||
|
||||
import numpy
|
||||
from thinc.api import get_array_module
|
||||
|
@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
|
|||
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
||||
from ..structs cimport LexemeC, TokenC
|
||||
from ..vocab cimport Vocab
|
||||
from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
|
||||
from .doc cimport Doc, set_children_from_heads, token_by_start
|
||||
from .span cimport Span
|
||||
from .token cimport Token
|
||||
|
||||
|
@ -147,7 +146,7 @@ def _merge(Doc doc, merges):
|
|||
syntactic root of the span.
|
||||
RETURNS (Token): The first newly merged token.
|
||||
"""
|
||||
cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
|
||||
cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
|
||||
cdef Span span
|
||||
cdef const LexemeC* lex
|
||||
cdef TokenC* token
|
||||
|
@ -165,7 +164,6 @@ def _merge(Doc doc, merges):
|
|||
merges.sort(key=_get_start)
|
||||
for merge_index, (span, attributes) in enumerate(merges):
|
||||
start = span.start
|
||||
end = span.end
|
||||
spans.append(span)
|
||||
# House the new merged token where it starts
|
||||
token = &doc.c[start]
|
||||
|
@ -203,8 +201,9 @@ def _merge(Doc doc, merges):
|
|||
# for the merged region. To do this, we create a boolean array indicating
|
||||
# whether the row is to be deleted, then use numpy.delete
|
||||
if doc.tensor is not None and doc.tensor.size != 0:
|
||||
doc.tensor = _resize_tensor(doc.tensor,
|
||||
[(m[0].start, m[0].end) for m in merges])
|
||||
doc.tensor = _resize_tensor(
|
||||
doc.tensor, [(m[0].start, m[0].end) for m in merges]
|
||||
)
|
||||
# Memorize span roots and sets dependencies of the newly merged
|
||||
# tokens to the dependencies of their roots.
|
||||
span_roots = []
|
||||
|
@ -267,11 +266,11 @@ def _merge(Doc doc, merges):
|
|||
span_index += 1
|
||||
if span_index < len(spans) and i == spans[span_index].start:
|
||||
# First token in a span
|
||||
doc.c[i - offset] = doc.c[i] # move token to its place
|
||||
doc.c[i - offset] = doc.c[i] # move token to its place
|
||||
offset += (spans[span_index].end - spans[span_index].start) - 1
|
||||
in_span = True
|
||||
if not in_span:
|
||||
doc.c[i - offset] = doc.c[i] # move token to its place
|
||||
doc.c[i - offset] = doc.c[i] # move token to its place
|
||||
|
||||
for i in range(doc.length - offset, doc.length):
|
||||
memset(&doc.c[i], 0, sizeof(TokenC))
|
||||
|
@ -345,7 +344,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
|||
if to_process_tensor:
|
||||
xp = get_array_module(doc.tensor)
|
||||
if xp is numpy:
|
||||
doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
|
||||
doc.tensor = xp.append(
|
||||
doc.tensor,
|
||||
xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
|
||||
axis=0
|
||||
)
|
||||
else:
|
||||
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
|
||||
resized_array = xp.zeros(shape, dtype="float32")
|
||||
|
@ -367,7 +370,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
|||
token.norm = 0 # reset norm
|
||||
if to_process_tensor:
|
||||
# setting the tensors of the split tokens to array of zeros
|
||||
doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
|
||||
doc.tensor[token_index + i:token_index + i + 1] = \
|
||||
xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
|
||||
# Update the character offset of the subtokens
|
||||
if i != 0:
|
||||
token.idx = orig_token.idx + idx_offset
|
||||
|
@ -455,7 +459,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
|
|||
def set_token_attrs(Token py_token, attrs):
|
||||
cdef TokenC* token = py_token.c
|
||||
cdef const LexemeC* lex = token.lex
|
||||
cdef Doc doc = py_token.doc
|
||||
# Assign attributes
|
||||
for attr_name, attr_value in attrs.items():
|
||||
if attr_name == "_": # Set extension attributes
|
||||
|
|
|
@ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
|
|||
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
|
||||
|
||||
|
||||
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
||||
cdef int [:, :] _get_lca_matrix(Doc, int start, int end)
|
||||
|
||||
|
||||
cdef class Doc:
|
||||
|
@ -61,7 +61,6 @@ cdef class Doc:
|
|||
cdef int length
|
||||
cdef int max_length
|
||||
|
||||
|
||||
cdef public object noun_chunks_iterator
|
||||
|
||||
cdef object __weakref__
|
||||
|
|
|
@ -43,14 +43,13 @@ from ..attrs cimport (
|
|||
attr_id_t,
|
||||
)
|
||||
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..typedefs cimport attr_t
|
||||
from .token cimport Token
|
||||
|
||||
from .. import parts_of_speech, schemas, util
|
||||
from ..attrs import IDS, intify_attr
|
||||
from ..compat import copy_reg, pickle
|
||||
from ..compat import copy_reg
|
||||
from ..errors import Errors, Warnings
|
||||
from ..morphology import Morphology
|
||||
from ..util import get_words_and_spaces
|
||||
from ._retokenize import Retokenizer
|
||||
from .underscore import Underscore, get_ext_args
|
||||
|
@ -784,7 +783,7 @@ cdef class Doc:
|
|||
# TODO:
|
||||
# 1. Test basic data-driven ORTH gazetteer
|
||||
# 2. Test more nuanced date and currency regex
|
||||
cdef attr_t entity_type, kb_id, ent_id
|
||||
cdef attr_t kb_id, ent_id
|
||||
cdef int ent_start, ent_end
|
||||
ent_spans = []
|
||||
for ent_info in ents:
|
||||
|
@ -987,7 +986,6 @@ cdef class Doc:
|
|||
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
"""
|
||||
cdef int i, j
|
||||
cdef attr_id_t feature
|
||||
cdef np.ndarray[attr_t, ndim=2] output
|
||||
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
||||
# See also #3064
|
||||
|
@ -999,8 +997,10 @@ cdef class Doc:
|
|||
py_attr_ids = [py_attr_ids]
|
||||
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
||||
try:
|
||||
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
||||
for id_ in py_attr_ids]
|
||||
py_attr_ids = [
|
||||
(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
||||
for id_ in py_attr_ids
|
||||
]
|
||||
except KeyError as msg:
|
||||
keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
|
||||
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
|
||||
|
@ -1030,8 +1030,6 @@ cdef class Doc:
|
|||
DOCS: https://spacy.io/api/doc#count_by
|
||||
"""
|
||||
cdef int i
|
||||
cdef attr_t attr
|
||||
cdef size_t count
|
||||
|
||||
if counts is None:
|
||||
counts = Counter()
|
||||
|
@ -1093,7 +1091,6 @@ cdef class Doc:
|
|||
cdef int i, col
|
||||
cdef int32_t abs_head_index
|
||||
cdef attr_id_t attr_id
|
||||
cdef TokenC* tokens = self.c
|
||||
cdef int length = len(array)
|
||||
if length != len(self):
|
||||
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
|
||||
|
@ -1225,7 +1222,7 @@ cdef class Doc:
|
|||
span.label,
|
||||
span.kb_id,
|
||||
span.id,
|
||||
span.text, # included as a check
|
||||
span.text, # included as a check
|
||||
))
|
||||
char_offset += len(doc.text)
|
||||
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
|
||||
|
@ -1508,7 +1505,6 @@ cdef class Doc:
|
|||
attributes are inherited from the syntactic root of the span.
|
||||
RETURNS (Token): The first newly merged token.
|
||||
"""
|
||||
cdef str tag, lemma, ent_type
|
||||
attr_len = len(attributes)
|
||||
span_len = len(spans)
|
||||
if not attr_len == span_len:
|
||||
|
@ -1624,7 +1620,6 @@ cdef class Doc:
|
|||
for token in char_span[1:]:
|
||||
token.is_sent_start = False
|
||||
|
||||
|
||||
for span_group in doc_json.get("spans", {}):
|
||||
spans = []
|
||||
for span in doc_json["spans"][span_group]:
|
||||
|
@ -1656,7 +1651,7 @@ cdef class Doc:
|
|||
start = token_by_char(self.c, self.length, token_data["start"])
|
||||
value = token_data["value"]
|
||||
self[start]._.set(token_attr, value)
|
||||
|
||||
|
||||
for span_attr in doc_json.get("underscore_span", {}):
|
||||
if not Span.has_extension(span_attr):
|
||||
Span.set_extension(span_attr)
|
||||
|
@ -1698,7 +1693,7 @@ cdef class Doc:
|
|||
token_data["dep"] = token.dep_
|
||||
token_data["head"] = token.head.i
|
||||
data["tokens"].append(token_data)
|
||||
|
||||
|
||||
if self.spans:
|
||||
data["spans"] = {}
|
||||
for span_group in self.spans:
|
||||
|
@ -1769,7 +1764,6 @@ cdef class Doc:
|
|||
output.fill(255)
|
||||
cdef int i, j, start_idx, end_idx
|
||||
cdef bytes byte_string
|
||||
cdef unsigned char utf8_char
|
||||
for i, byte_string in enumerate(byte_strings):
|
||||
j = 0
|
||||
start_idx = 0
|
||||
|
@ -1822,8 +1816,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
|
|||
|
||||
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
|
||||
# note: end is exclusive
|
||||
cdef TokenC* head
|
||||
cdef TokenC* child
|
||||
cdef int i
|
||||
# Set number of left/right children to 0. We'll increment it in the loops.
|
||||
for i in range(start, end):
|
||||
|
@ -1923,7 +1915,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k):
|
|||
return -1
|
||||
|
||||
|
||||
cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
||||
cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end):
|
||||
"""Given a doc and a start and end position defining a set of contiguous
|
||||
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
|
||||
LCA[i, j] is the index of the lowest common ancestor among token i and j.
|
||||
|
@ -1936,7 +1928,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
|||
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
|
||||
with shape (n, n), where n = len(doc).
|
||||
"""
|
||||
cdef int [:,:] lca_matrix
|
||||
cdef int [:, :] lca_matrix
|
||||
cdef int j, k
|
||||
n_tokens= end - start
|
||||
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
|
||||
|
|
|
@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
|
|||
|
||||
cimport cython
|
||||
from cython.operator cimport dereference
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdint cimport int32_t
|
||||
from libcpp.pair cimport pair
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.unordered_set cimport unordered_set
|
||||
|
@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
|
|||
import weakref
|
||||
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from preshed.maps cimport map_get_unless_missing
|
||||
|
||||
from .. import Errors
|
||||
|
||||
|
@ -28,7 +27,7 @@ from .token import Token
|
|||
cdef class Edge:
|
||||
cdef readonly Graph graph
|
||||
cdef readonly int i
|
||||
|
||||
|
||||
def __init__(self, Graph graph, int i):
|
||||
self.graph = graph
|
||||
self.i = i
|
||||
|
@ -44,7 +43,7 @@ cdef class Edge:
|
|||
@property
|
||||
def head(self) -> "Node":
|
||||
return Node(self.graph, self.graph.c.edges[self.i].head)
|
||||
|
||||
|
||||
@property
|
||||
def tail(self) -> "Tail":
|
||||
return Node(self.graph, self.graph.c.edges[self.i].tail)
|
||||
|
@ -70,7 +69,7 @@ cdef class Node:
|
|||
def __init__(self, Graph graph, int i):
|
||||
"""A reference to a node of an annotation graph. Each node is made up of
|
||||
an ordered set of zero or more token indices.
|
||||
|
||||
|
||||
Node references are usually created by the Graph object itself, or from
|
||||
the Node or Edge objects. You usually won't need to instantiate this
|
||||
class yourself.
|
||||
|
@ -109,13 +108,13 @@ cdef class Node:
|
|||
@property
|
||||
def is_none(self) -> bool:
|
||||
"""Whether the node is a special value, indicating 'none'.
|
||||
|
||||
|
||||
The NoneNode type is returned by the Graph, Edge and Node objects when
|
||||
there is no match to a query. It has the same API as Node, but it always
|
||||
returns NoneNode, NoneEdge or empty lists for its queries.
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
@property
|
||||
def doc(self) -> "Doc":
|
||||
"""The Doc object that the graph refers to."""
|
||||
|
@ -130,19 +129,19 @@ cdef class Node:
|
|||
def head(self, i=None, label=None) -> "Node":
|
||||
"""Get the head of the first matching edge, searching by index, label,
|
||||
both or neither.
|
||||
|
||||
|
||||
For instance, `node.head(i=1)` will get the head of the second edge that
|
||||
this node is a tail of. `node.head(i=1, label="ARG0")` will further
|
||||
check that the second edge has the label `"ARG0"`.
|
||||
|
||||
|
||||
If no matching node can be found, the graph's NoneNode is returned.
|
||||
"""
|
||||
return self.headed(i=i, label=label)
|
||||
|
||||
|
||||
def tail(self, i=None, label=None) -> "Node":
|
||||
"""Get the tail of the first matching edge, searching by index, label,
|
||||
both or neither.
|
||||
|
||||
|
||||
If no matching node can be found, the graph's NoneNode is returned.
|
||||
"""
|
||||
return self.tailed(i=i, label=label).tail
|
||||
|
@ -171,7 +170,7 @@ cdef class Node:
|
|||
cdef vector[int] edge_indices
|
||||
self._find_edges(edge_indices, "head", label)
|
||||
return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices]
|
||||
|
||||
|
||||
def tails(self, label=None) -> List["Node"]:
|
||||
"""Find all matching tails of this node."""
|
||||
cdef vector[int] edge_indices
|
||||
|
@ -200,7 +199,7 @@ cdef class Node:
|
|||
return NoneEdge(self.graph)
|
||||
else:
|
||||
return Edge(self.graph, idx)
|
||||
|
||||
|
||||
def tailed(self, i=None, label=None) -> Edge:
|
||||
"""Find the first matching edge tailed by this node.
|
||||
If no matching edge can be found, the graph's NoneEdge is returned.
|
||||
|
@ -283,7 +282,7 @@ cdef class NoneEdge(Edge):
|
|||
def __init__(self, graph):
|
||||
self.graph = graph
|
||||
self.i = -1
|
||||
|
||||
|
||||
@property
|
||||
def doc(self) -> "Doc":
|
||||
return self.graph.doc
|
||||
|
@ -291,7 +290,7 @@ cdef class NoneEdge(Edge):
|
|||
@property
|
||||
def head(self) -> "NoneNode":
|
||||
return NoneNode(self.graph)
|
||||
|
||||
|
||||
@property
|
||||
def tail(self) -> "NoneNode":
|
||||
return NoneNode(self.graph)
|
||||
|
@ -319,7 +318,7 @@ cdef class NoneNode(Node):
|
|||
|
||||
def __len__(self):
|
||||
return 0
|
||||
|
||||
|
||||
@property
|
||||
def is_none(self):
|
||||
return -1
|
||||
|
@ -340,14 +339,14 @@ cdef class NoneNode(Node):
|
|||
|
||||
def walk_heads(self):
|
||||
yield from []
|
||||
|
||||
|
||||
def walk_tails(self):
|
||||
yield from []
|
||||
|
||||
|
||||
|
||||
cdef class Graph:
|
||||
"""A set of directed labelled relationships between sets of tokens.
|
||||
|
||||
|
||||
EXAMPLE:
|
||||
Construction 1
|
||||
>>> graph = Graph(doc, name="srl")
|
||||
|
@ -372,7 +371,9 @@ cdef class Graph:
|
|||
>>> assert graph.has_node((0,))
|
||||
>>> assert graph.has_edge((0,), (1,3), label="agent")
|
||||
"""
|
||||
def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
|
||||
def __init__(
|
||||
self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None # no-cython-lint
|
||||
):
|
||||
"""Create a Graph object.
|
||||
|
||||
doc (Doc): The Doc object the graph will refer to.
|
||||
|
@ -438,13 +439,11 @@ cdef class Graph:
|
|||
|
||||
def add_edge(self, head, tail, *, label="", weight=None) -> Edge:
|
||||
"""Add an edge to the graph, connecting two groups of tokens.
|
||||
|
||||
|
||||
If there is already an edge for the (head, tail, label) triple, it will
|
||||
be returned, and no new edge will be created. The weight of the edge
|
||||
will be updated if a weight is specified.
|
||||
"""
|
||||
label_hash = self.doc.vocab.strings.as_int(label)
|
||||
weight_float = weight if weight is not None else 0.0
|
||||
edge_index = add_edge(
|
||||
&self.c,
|
||||
EdgeC(
|
||||
|
@ -478,11 +477,11 @@ cdef class Graph:
|
|||
def has_edge(self, head, tail, label) -> bool:
|
||||
"""Check whether a (head, tail, label) triple is an edge in the graph."""
|
||||
return not self.get_edge(head, tail, label=label).is_none
|
||||
|
||||
|
||||
def add_node(self, indices) -> Node:
|
||||
"""Add a node to the graph and return it. Nodes refer to ordered sets
|
||||
of token indices.
|
||||
|
||||
|
||||
This method is idempotent: if there is already a node for the given
|
||||
indices, it is returned without a new node being created.
|
||||
"""
|
||||
|
@ -510,7 +509,7 @@ cdef class Graph:
|
|||
return NoneNode(self)
|
||||
else:
|
||||
return Node(self, node_index)
|
||||
|
||||
|
||||
def has_node(self, tuple indices) -> bool:
|
||||
"""Check whether the graph has a node for the given indices."""
|
||||
return not self.get_node(indices).is_none
|
||||
|
@ -570,7 +569,7 @@ cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil:
|
|||
graph.roots.insert(index)
|
||||
graph.node_map.insert(pair[hash_t, int](key, index))
|
||||
return index
|
||||
|
||||
|
||||
|
||||
cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil:
|
||||
key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
|
||||
|
|
|
@ -89,4 +89,3 @@ cdef class MorphAnalysis:
|
|||
|
||||
def __repr__(self):
|
||||
return self.to_json()
|
||||
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
cimport numpy as np
|
||||
from libc.math cimport sqrt
|
||||
|
||||
import copy
|
||||
import warnings
|
||||
|
@ -10,11 +9,10 @@ from thinc.api import get_array_module
|
|||
from ..attrs cimport *
|
||||
from ..attrs cimport ORTH, attr_id_t
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..parts_of_speech cimport univ_pos_t
|
||||
from ..structs cimport LexemeC, TokenC
|
||||
from ..structs cimport TokenC
|
||||
from ..symbols cimport dep
|
||||
from ..typedefs cimport attr_t, flags_t, hash_t
|
||||
from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
|
||||
from ..typedefs cimport attr_t, hash_t
|
||||
from .doc cimport _get_lca_matrix, get_token_attr
|
||||
from .token cimport Token
|
||||
|
||||
from ..errors import Errors, Warnings
|
||||
|
@ -595,7 +593,6 @@ cdef class Span:
|
|||
"""
|
||||
return "".join([t.text_with_ws for t in self])
|
||||
|
||||
|
||||
@property
|
||||
def noun_chunks(self):
|
||||
"""Iterate over the base noun phrases in the span. Yields base
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import struct
|
||||
import weakref
|
||||
from copy import deepcopy
|
||||
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
|
||||
from typing import Iterable, Optional, Union
|
||||
|
||||
import srsly
|
||||
|
||||
|
@ -34,7 +34,7 @@ cdef class SpanGroup:
|
|||
|
||||
DOCS: https://spacy.io/api/spangroup
|
||||
"""
|
||||
def __init__(self, doc, *, name="", attrs={}, spans=[]):
|
||||
def __init__(self, doc, *, name="", attrs={}, spans=[]): # no-cython-lint
|
||||
"""Create a SpanGroup.
|
||||
|
||||
doc (Doc): The reference Doc object.
|
||||
|
@ -311,7 +311,7 @@ cdef class SpanGroup:
|
|||
|
||||
other_attrs = deepcopy(other_group.attrs)
|
||||
span_group.attrs.update({
|
||||
key: value for key, value in other_attrs.items() \
|
||||
key: value for key, value in other_attrs.items()
|
||||
if key not in span_group.attrs
|
||||
})
|
||||
if len(other_group):
|
||||
|
|
|
@ -26,7 +26,7 @@ cdef class Token:
|
|||
cdef Token self = Token.__new__(Token, vocab, doc, offset)
|
||||
return self
|
||||
|
||||
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
|
||||
# cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
|
||||
# cdef TokenC token
|
||||
# attrs = normalize_attrs(attrs)
|
||||
|
||||
|
@ -98,12 +98,10 @@ cdef class Token:
|
|||
elif feat_name == SENT_START:
|
||||
token.sent_start = value
|
||||
|
||||
|
||||
@staticmethod
|
||||
cdef inline int missing_dep(const TokenC* token) nogil:
|
||||
return token.dep == MISSING_DEP
|
||||
|
||||
|
||||
@staticmethod
|
||||
cdef inline int missing_head(const TokenC* token) nogil:
|
||||
return Token.missing_dep(token)
|
||||
|
|
|
@ -1,13 +1,11 @@
|
|||
# cython: infer_types=True
|
||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||
cimport numpy as np
|
||||
from cython.view cimport array as cvarray
|
||||
|
||||
np.import_array()
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy
|
||||
from thinc.api import get_array_module
|
||||
|
||||
from ..attrs cimport (
|
||||
|
@ -238,7 +236,7 @@ cdef class Token:
|
|||
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
|
||||
return result.item()
|
||||
|
||||
|
||||
def has_morph(self):
|
||||
"""Check whether the token has annotated morph information.
|
||||
Return False when the morph annotation is unset/missing.
|
||||
|
@ -545,9 +543,9 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
if self.i + 1 == len(self.doc):
|
||||
return True
|
||||
elif self.doc[self.i+1].is_sent_start == None:
|
||||
elif self.doc[self.i+1].is_sent_start is None:
|
||||
return None
|
||||
elif self.doc[self.i+1].is_sent_start == True:
|
||||
elif self.doc[self.i+1].is_sent_start is True:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
|
|
@ -37,10 +37,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
|
|||
b2a.append(set())
|
||||
# Process the alignment at the current position
|
||||
if A[token_idx_a] == B[token_idx_b] and \
|
||||
(char_idx_a == 0 or \
|
||||
char_to_token_a[char_idx_a - 1] < token_idx_a) and \
|
||||
(char_idx_b == 0 or \
|
||||
char_to_token_b[char_idx_b - 1] < token_idx_b):
|
||||
(
|
||||
char_idx_a == 0 or
|
||||
char_to_token_a[char_idx_a - 1] < token_idx_a
|
||||
) and \
|
||||
(
|
||||
char_idx_b == 0 or
|
||||
char_to_token_b[char_idx_b - 1] < token_idx_b
|
||||
):
|
||||
# Current tokens are identical and both character offsets are the
|
||||
# start of a token (either at the beginning of the document or the
|
||||
# previous character belongs to a different token)
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import warnings
|
||||
from collections.abc import Iterable as IterableInstance
|
||||
|
||||
import numpy
|
||||
|
@ -31,9 +30,9 @@ cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
|||
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
|
||||
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
||||
if "entities" in doc_annot:
|
||||
_add_entities_to_doc(output, doc_annot["entities"])
|
||||
_add_entities_to_doc(output, doc_annot["entities"])
|
||||
if "spans" in doc_annot:
|
||||
_add_spans_to_doc(output, doc_annot["spans"])
|
||||
_add_spans_to_doc(output, doc_annot["spans"])
|
||||
if array.size:
|
||||
output = output.from_array(attrs, array)
|
||||
# links are currently added with ENT_KB_ID on the token level
|
||||
|
@ -161,7 +160,6 @@ cdef class Example:
|
|||
self._y_sig = y_sig
|
||||
return self._cached_alignment
|
||||
|
||||
|
||||
def _get_aligned_vectorized(self, align, gold_values):
|
||||
# Fast path for Doc attributes/fields that are predominantly a single value,
|
||||
# i.e., TAG, POS, MORPH.
|
||||
|
@ -204,7 +202,6 @@ cdef class Example:
|
|||
|
||||
return output.tolist()
|
||||
|
||||
|
||||
def _get_aligned_non_vectorized(self, align, gold_values):
|
||||
# Slower path for fields that return multiple values (resulting
|
||||
# in ragged arrays that cannot be vectorized trivially).
|
||||
|
@ -221,7 +218,6 @@ cdef class Example:
|
|||
|
||||
return output
|
||||
|
||||
|
||||
def get_aligned(self, field, as_string=False):
|
||||
"""Return an aligned array for a token attribute."""
|
||||
align = self.alignment.x2y
|
||||
|
@ -330,7 +326,7 @@ cdef class Example:
|
|||
missing=None
|
||||
)
|
||||
# Now fill the tokens we can align to O.
|
||||
O = 2 # I=1, O=2, B=3
|
||||
O = 2 # I=1, O=2, B=3 # no-cython-lint: E741
|
||||
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
|
||||
if x_tags[i] is None:
|
||||
if ent_iob == O:
|
||||
|
@ -340,7 +336,7 @@ cdef class Example:
|
|||
return x_ents, x_tags
|
||||
|
||||
def get_aligned_ner(self):
|
||||
x_ents, x_tags = self.get_aligned_ents_and_ner()
|
||||
_x_ents, x_tags = self.get_aligned_ents_and_ner()
|
||||
return x_tags
|
||||
|
||||
def get_matching_ents(self, check_label=True):
|
||||
|
@ -398,7 +394,6 @@ cdef class Example:
|
|||
|
||||
return span_dict
|
||||
|
||||
|
||||
def _links_to_dict(self):
|
||||
links = {}
|
||||
for ent in self.reference.ents:
|
||||
|
@ -589,6 +584,7 @@ def _fix_legacy_dict_data(example_dict):
|
|||
"doc_annotation": doc_dict
|
||||
}
|
||||
|
||||
|
||||
def _has_field(annot, field):
|
||||
if field not in annot:
|
||||
return False
|
||||
|
@ -625,6 +621,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
|||
ent_types.append("")
|
||||
return ent_iobs, ent_types
|
||||
|
||||
|
||||
def _parse_links(vocab, words, spaces, links):
|
||||
reference = Doc(vocab, words=words, spaces=spaces)
|
||||
starts = {token.idx: token.i for token in reference}
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import json
|
||||
import warnings
|
||||
|
||||
import srsly
|
||||
|
@ -6,7 +5,7 @@ import srsly
|
|||
from .. import util
|
||||
from ..errors import Warnings
|
||||
from ..tokens import Doc
|
||||
from .iob_utils import offsets_to_biluo_tags, tags_to_entities
|
||||
from .iob_utils import offsets_to_biluo_tags
|
||||
|
||||
|
||||
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||
|
@ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
|||
json_doc = {"id": doc_id, "paragraphs": []}
|
||||
for i, doc in enumerate(docs):
|
||||
raw = None if doc.has_unknown_spaces else doc.text
|
||||
json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
|
||||
json_para = {
|
||||
'raw': raw,
|
||||
"sentences": [],
|
||||
"cats": [],
|
||||
"entities": [],
|
||||
"links": []
|
||||
}
|
||||
for cat, val in doc.cats.items():
|
||||
json_cat = {"label": cat, "value": val}
|
||||
json_para["cats"].append(json_cat)
|
||||
|
@ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
|||
if ent.kb_id_:
|
||||
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
||||
json_para["links"].append(link_dict)
|
||||
biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
|
||||
biluo_tags = offsets_to_biluo_tags(
|
||||
doc, json_para["entities"], missing=ner_missing_tag
|
||||
)
|
||||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
|
||||
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||
for j, sent in enumerate(doc.sents):
|
||||
json_sent = {"tokens": [], "brackets": []}
|
||||
for token in sent:
|
||||
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
|
||||
json_token = {
|
||||
"id": token.i, "orth": token.text, "space": token.whitespace_
|
||||
}
|
||||
if include_annotation["TAG"]:
|
||||
json_token["tag"] = token.tag_
|
||||
if include_annotation["POS"]:
|
||||
|
@ -125,9 +134,14 @@ def json_to_annotations(doc):
|
|||
else:
|
||||
sent_starts.append(-1)
|
||||
if "brackets" in sent:
|
||||
brackets.extend((b["first"] + sent_start_i,
|
||||
b["last"] + sent_start_i, b["label"])
|
||||
for b in sent["brackets"])
|
||||
brackets.extend(
|
||||
(
|
||||
b["first"] + sent_start_i,
|
||||
b["last"] + sent_start_i,
|
||||
b["label"]
|
||||
)
|
||||
for b in sent["brackets"]
|
||||
)
|
||||
|
||||
example["token_annotation"] = dict(
|
||||
ids=ids,
|
||||
|
@ -160,6 +174,7 @@ def json_to_annotations(doc):
|
|||
)
|
||||
yield example
|
||||
|
||||
|
||||
def json_iterate(bytes utf8_str):
|
||||
# We should've made these files jsonl... But since we didn't, parse out
|
||||
# the docs one-by-one to reduce memory usage.
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
cimport numpy as np
|
||||
from cython.operator cimport dereference as deref
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libcpp.set cimport set as cppset
|
||||
from murmurhash.mrmr cimport hash128_x64
|
||||
|
||||
import functools
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from typing import cast
|
||||
|
@ -119,7 +117,7 @@ cdef class Vectors:
|
|||
if self.mode == Mode.default:
|
||||
if data is None:
|
||||
if shape is None:
|
||||
shape = (0,0)
|
||||
shape = (0, 0)
|
||||
ops = get_current_ops()
|
||||
data = ops.xp.zeros(shape, dtype="f")
|
||||
self._unset = cppset[int]({i for i in range(data.shape[0])})
|
||||
|
@ -260,11 +258,10 @@ cdef class Vectors:
|
|||
def __eq__(self, other):
|
||||
# Check for equality, with faster checks first
|
||||
return (
|
||||
self.shape == other.shape
|
||||
and self.key2row == other.key2row
|
||||
and self.to_bytes(exclude=["strings"])
|
||||
== other.to_bytes(exclude=["strings"])
|
||||
)
|
||||
self.shape == other.shape
|
||||
and self.key2row == other.key2row
|
||||
and self.to_bytes(exclude=["strings"]) == other.to_bytes(exclude=["strings"])
|
||||
)
|
||||
|
||||
def resize(self, shape, inplace=False):
|
||||
"""Resize the underlying vectors array. If inplace=True, the memory
|
||||
|
@ -520,11 +517,12 @@ cdef class Vectors:
|
|||
# vectors e.g. (10000, 300)
|
||||
# sims e.g. (1024, 10000)
|
||||
sims = xp.dot(batch, vectors.T)
|
||||
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:]
|
||||
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
|
||||
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:, -n:]
|
||||
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:, -n:]
|
||||
|
||||
if sort and n >= 2:
|
||||
sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
|
||||
sorted_index = xp.arange(scores.shape[0])[:, None][i:i+batch_size], \
|
||||
xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1]
|
||||
scores[i:i+batch_size] = scores[sorted_index]
|
||||
best_rows[i:i+batch_size] = best_rows[sorted_index]
|
||||
|
||||
|
@ -538,8 +536,12 @@ cdef class Vectors:
|
|||
|
||||
numpy_rows = get_current_ops().to_numpy(best_rows)
|
||||
keys = xp.asarray(
|
||||
[[row2key[row] for row in numpy_rows[i] if row in row2key]
|
||||
for i in range(len(queries)) ], dtype="uint64")
|
||||
[
|
||||
[row2key[row] for row in numpy_rows[i] if row in row2key]
|
||||
for i in range(len(queries))
|
||||
],
|
||||
dtype="uint64"
|
||||
)
|
||||
return (keys, best_rows, scores)
|
||||
|
||||
def to_ops(self, ops: Ops):
|
||||
|
@ -582,9 +584,9 @@ cdef class Vectors:
|
|||
"""
|
||||
xp = get_array_module(self.data)
|
||||
if xp is numpy:
|
||||
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
|
||||
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) # no-cython-lint
|
||||
else:
|
||||
save_array = lambda arr, file_: xp.save(file_, arr)
|
||||
save_array = lambda arr, file_: xp.save(file_, arr) # no-cython-lint
|
||||
|
||||
def save_vectors(path):
|
||||
# the source of numpy.save indicates that the file object is closed after use.
|
||||
|
|
|
@ -32,7 +32,7 @@ cdef class Vocab:
|
|||
cdef public object writing_system
|
||||
cdef public object get_noun_chunks
|
||||
cdef readonly int length
|
||||
cdef public object _unused_object # TODO remove in v4, see #9150
|
||||
cdef public object _unused_object # TODO remove in v4, see #9150
|
||||
cdef public object lex_attr_getters
|
||||
cdef public object cfg
|
||||
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
# cython: profile=True
|
||||
from libc.string cimport memcpy
|
||||
|
||||
import functools
|
||||
|
||||
import numpy
|
||||
|
@ -19,7 +17,6 @@ from .errors import Errors
|
|||
from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
|
||||
from .lang.norm_exceptions import BASE_NORMS
|
||||
from .lookups import Lookups
|
||||
from .util import registry
|
||||
from .vectors import Mode as VectorsMode
|
||||
from .vectors import Vectors
|
||||
|
||||
|
@ -51,9 +48,17 @@ cdef class Vocab:
|
|||
|
||||
DOCS: https://spacy.io/api/vocab
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
|
||||
oov_prob=-20., vectors_name=None, writing_system={},
|
||||
get_noun_chunks=None, **deprecated_kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
lex_attr_getters=None,
|
||||
strings=tuple(),
|
||||
lookups=None,
|
||||
oov_prob=-20.,
|
||||
vectors_name=None,
|
||||
writing_system={}, # no-cython-lint
|
||||
get_noun_chunks=None,
|
||||
**deprecated_kwargs
|
||||
):
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||
|
@ -150,7 +155,6 @@ cdef class Vocab:
|
|||
cdef LexemeC* lex
|
||||
cdef hash_t key = self.strings[string]
|
||||
lex = <LexemeC*>self._by_orth.get(key)
|
||||
cdef size_t addr
|
||||
if lex != NULL:
|
||||
assert lex.orth in self.strings
|
||||
if lex.orth != key:
|
||||
|
@ -183,7 +187,7 @@ cdef class Vocab:
|
|||
# of the doc ownership).
|
||||
# TODO: Change the C API so that the mem isn't passed in here.
|
||||
mem = self.mem
|
||||
#if len(string) < 3 or self.length < 10000:
|
||||
# if len(string) < 3 or self.length < 10000:
|
||||
# mem = self.mem
|
||||
cdef bint is_oov = mem is not self.mem
|
||||
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
||||
|
@ -463,7 +467,6 @@ cdef class Vocab:
|
|||
self.lookups.get_table("lexeme_norm"),
|
||||
)
|
||||
|
||||
|
||||
def to_disk(self, path, *, exclude=tuple()):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
|
@ -476,7 +479,6 @@ cdef class Vocab:
|
|||
path = util.ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
setters = ["strings", "vectors"]
|
||||
if "strings" not in exclude:
|
||||
self.strings.to_disk(path / "strings.json")
|
||||
if "vectors" not in exclude:
|
||||
|
@ -495,7 +497,6 @@ cdef class Vocab:
|
|||
DOCS: https://spacy.io/api/vocab#from_disk
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
getters = ["strings", "vectors"]
|
||||
if "strings" not in exclude:
|
||||
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
|
||||
if "vectors" not in exclude:
|
||||
|
|
|
@ -856,7 +856,7 @@ token-to-vector embedding component like [`Tok2Vec`](/api/tok2vec) or
|
|||
training a pipeline with components sourced from an existing pipeline: if
|
||||
multiple components (e.g. tagger, parser, NER) listen to the same
|
||||
token-to-vector component, but some of them are frozen and not updated, their
|
||||
performance may degrade significally as the token-to-vector component is updated
|
||||
performance may degrade significantly as the token-to-vector component is updated
|
||||
with new data. To prevent this, listeners can be replaced with a standalone
|
||||
token-to-vector layer that is owned by the component and doesn't change if the
|
||||
component isn't updated.
|
||||
|
|
|
@ -60,7 +60,7 @@ architectures and their arguments and hyperparameters.
|
|||
| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ |
|
||||
| `max_length` | Maximum length of the produced spans, defaults to `25`. ~~Optional[int]~~ |
|
||||
| `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||
|
||||
|
|
|
@ -261,7 +261,7 @@ source code and recompiling frequently.
|
|||
|
||||
#### Visual Studio Code extension
|
||||
|
||||
![spaCy extension demo](/images/spacy-extension-demo.gif)
|
||||
![spaCy extension demo](/images/spacy-extension-demo.gif)
|
||||
|
||||
The [spaCy VSCode Extension](https://github.com/explosion/spacy-vscode) provides
|
||||
additional tooling and features for working with spaCy's config files. Version
|
||||
|
@ -310,7 +310,7 @@ You can configure the build process with the following environment variables:
|
|||
| Variable | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `SPACY_EXTRAS` | Additional Python packages to install alongside spaCy with optional version specifications. Should be a string that can be passed to `pip install`. See [`Makefile`](%%GITHUB_SPACY/Makefile) for defaults. |
|
||||
| `PYVER` | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.6`. |
|
||||
| `PYVER` | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.8`. |
|
||||
| `WHEELHOUSE` | Directory to store the wheel files during compilation. Defaults to `./wheelhouse`. |
|
||||
|
||||
### Run tests {id="run-tests"}
|
||||
|
|
|
@ -113,7 +113,7 @@ print(doc[2].morph) # 'Case=Nom|Person=2|PronType=Prs'
|
|||
print(doc[2].pos_) # 'PRON'
|
||||
```
|
||||
|
||||
## Lemmatization {id="lemmatization",model="lemmatizer",version="3"}
|
||||
## Lemmatization {id="lemmatization",version="3"}
|
||||
|
||||
spaCy provides two pipeline components for lemmatization:
|
||||
|
||||
|
@ -170,7 +170,7 @@ nlp = spacy.blank("sv")
|
|||
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
```
|
||||
|
||||
### Rule-based lemmatizer {id="lemmatizer-rule"}
|
||||
### Rule-based lemmatizer {id="lemmatizer-rule",model="morphologizer"}
|
||||
|
||||
When training pipelines that include a component that assigns part-of-speech
|
||||
tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a
|
||||
|
@ -194,7 +194,7 @@ information, without consulting the context of the token. The rule-based
|
|||
lemmatizer also accepts list-based exception files. For English, these are
|
||||
acquired from [WordNet](https://wordnet.princeton.edu/).
|
||||
|
||||
### Trainable lemmatizer
|
||||
### Trainable lemmatizer {id="lemmatizer-train",model="trainable_lemmatizer"}
|
||||
|
||||
The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma
|
||||
transformations from a training corpus that includes lemma annotations. This
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
"indexName": "spacy"
|
||||
},
|
||||
"binderUrl": "explosion/spacy-io-binder",
|
||||
"binderVersion": "3.5",
|
||||
"binderVersion": "3.6",
|
||||
"sections": [
|
||||
{ "id": "usage", "title": "Usage Documentation", "theme": "blue" },
|
||||
{ "id": "models", "title": "Models Documentation", "theme": "blue" },
|
||||
|
|
Loading…
Reference in New Issue
Block a user