Merge branch 'upstream_master' into sync_v4

commit 0e3b6a87d6
svlandeg, 2023-07-19 16:37:31 +02:00
82 changed files with 1201 additions and 687 deletions

View File

@@ -45,6 +45,12 @@ jobs:
         run: |
           python -m pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+      - name: cython-lint
+        run: |
+          python -m pip install cython-lint -c requirements.txt
+          # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
+          cython-lint spacy --ignore E501,W291,E266
   tests:
     name: Test
     needs: Validate

View File

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
+override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif
 ifndef PYVER

View File

@@ -36,4 +36,5 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+cython-lint>=0.15.0; python_version >= "3.7"
 isort>=5.0,<6.0

View File

@@ -47,4 +47,5 @@ cdef enum attr_id_t:
 MORPH = symbols.MORPH
 ENT_ID = symbols.ENT_ID
-IDX = symbols.IDX
+IDX
+SENT_END

View File

@@ -32,6 +32,7 @@ def init_vectors_cli(
 mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
 verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
 # fmt: on
 ):
 """Convert word vectors for use with spaCy. Will export an nlp object that
@@ -53,6 +54,7 @@ def init_vectors_cli(
 truncate=truncate,
 prune=prune,
 mode=mode,
+attr=attr,
 )
 msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
 nlp.to_disk(output_dir)

View File

@@ -128,7 +128,7 @@ grad_factor = 1.0
 {% if "span_finder" in components -%}
 [components.span_finder]
 factory = "span_finder"
-max_length = null
+max_length = 25
 min_length = null
 scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
 spans_key = "sc"
@@ -415,7 +415,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "span_finder" in components %}
 [components.span_finder]
 factory = "span_finder"
-max_length = null
+max_length = 25
 min_length = null
 scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
 spans_key = "sc"

View File

@@ -1,4 +1,3 @@
-import itertools
 import uuid
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -218,7 +217,7 @@ class SpanRenderer:
 + (self.offset_step * (len(entities) - 1))
 )
 markup += self.span_template.format(
-text=token["text"],
+text=escape_html(token["text"]),
 span_slices=slices,
 span_starts=starts,
 total_height=total_height,
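The escape_html change above closes a gap where span-style rendering emitted raw token text. A minimal sketch of the affected path, using an invented example sentence that is not part of the diff:

# Hedged sketch of SpanRenderer output with HTML-special characters in the
# token text; with the change above, "&" is emitted as "&amp;" in the markup.
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("AT&T stock rose")
doc.spans["sc"] = [Span(doc, 0, 1, label="ORG")]
html = displacy.render(doc, style="span")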

View File

@@ -208,6 +208,9 @@ class Warnings(metaclass=ErrorsWithCodes):
 W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
 "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
 W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
+"key attribute for vectors, configure it through Vectors(attr=) or "
+"'spacy init vectors --attr'")
 # v4 warning strings
 W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
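W125 points users at the two supported ways of choosing the token attribute that vectors are keyed on. A small sketch, assuming a spaCy build that includes the new Vectors(attr=...) parameter and the --attr flag added to init_vectors_cli above; the table contents and file names are invented:

# Hedged sketch: key a vectors table on LOWER instead of the default ORTH.
import numpy
import spacy
from spacy.vectors import Vectors

nlp = spacy.blank("en")
data = numpy.zeros((3, 16), dtype="f")
keys = ["cat", "dog", "house"]
nlp.vocab.vectors = Vectors(data=data, keys=keys, attr="LOWER")

# Equivalent CLI route (kept as a comment so this block stays Python-only):
#   python -m spacy init vectors en my_vectors.txt ./output_dir --attr LOWER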

View File

@@ -12,8 +12,9 @@ from .candidate import Candidate
 cdef class KnowledgeBase:
-"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
-to support entity linking of named entities to real-world concepts.
+"""A `KnowledgeBase` instance stores unique identifiers for entities and
+their textual aliases, to support entity linking of named entities to
+real-world concepts.
 This is an abstract class and requires its operations to be implemented.
 DOCS: https://spacy.io/api/kb
@@ -31,7 +32,9 @@ cdef class KnowledgeBase:
 self.entity_vector_length = entity_vector_length
 self.mem = Pool()
-def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
+def get_candidates_batch(
+self, mentions: SpanGroup
+) -> Iterable[Iterable[Candidate]]:
 """
 Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
 entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
@@ -52,7 +55,9 @@ cdef class KnowledgeBase:
 RETURNS (Iterable[Candidate]): Identified candidates.
 """
 raise NotImplementedError(
-Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
+Errors.E1045.format(
+parent="KnowledgeBase", method="get_candidates", name=self.__name__
+)
 )
 def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
@@ -70,7 +75,9 @@ cdef class KnowledgeBase:
 RETURNS (Iterable[float]): Vector for specified entity.
 """
 raise NotImplementedError(
-Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
+Errors.E1045.format(
+parent="KnowledgeBase", method="get_vector", name=self.__name__
+)
 )
 def to_bytes(self, **kwargs) -> bytes:
@@ -78,7 +85,9 @@ cdef class KnowledgeBase:
 RETURNS (bytes): Current state as binary string.
 """
 raise NotImplementedError(
-Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
+Errors.E1045.format(
+parent="KnowledgeBase", method="to_bytes", name=self.__name__
+)
 )
 def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
@@ -87,27 +96,37 @@ cdef class KnowledgeBase:
 exclude (Tuple[str]): Properties to exclude when restoring KB.
 """
 raise NotImplementedError(
-Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
+Errors.E1045.format(
+parent="KnowledgeBase", method="from_bytes", name=self.__name__
+)
 )
-def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+def to_disk(
+self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+) -> None:
 """
 Write KnowledgeBase content to disk.
 path (Union[str, Path]): Target file path.
 exclude (Iterable[str]): List of components to exclude.
 """
 raise NotImplementedError(
-Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
+Errors.E1045.format(
+parent="KnowledgeBase", method="to_disk", name=self.__name__
+)
 )
-def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+def from_disk(
+self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+) -> None:
 """
 Load KnowledgeBase content from disk.
 path (Union[str, Path]): Target file path.
 exclude (Iterable[str]): List of components to exclude.
 """
 raise NotImplementedError(
-Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
+Errors.E1045.format(
+parent="KnowledgeBase", method="from_disk", name=self.__name__
+)
 )
 @property

View File

@@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 # optional data, we can let users configure a DB as the backend for this.
 cdef object _features_table
 cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
 """Add an entity vector to the vectors table."""
 cdef int64_t new_index = self._vectors_table.size()
 self._vectors_table.push_back(entity_vector)
 return new_index
-cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
-int32_t vector_index, int feats_row) nogil:
+cdef inline int64_t c_add_entity(
+self,
+hash_t entity_hash,
+float freq,
+int32_t vector_index,
+int feats_row
+) nogil:
 """Add an entry to the vector of entries.
-After calling this method, make sure to update also the _entry_index using the return value"""
+After calling this method, make sure to update also the _entry_index
+using the return value"""
 # This is what we'll map the entity hash key to. It's where the entry will sit
 # in the vector of entries, so we can get it later.
 cdef int64_t new_index = self._entries.size()
-# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
+# Avoid struct initializer to enable nogil, cf.
+# https://github.com/cython/cython/issues/1642
 cdef KBEntryC entry
 entry.entity_hash = entity_hash
 entry.vector_index = vector_index
@@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 self._entries.push_back(entry)
 return new_index
-cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
-"""Connect a mention to a list of potential entities with their prior probabilities .
-After calling this method, make sure to update also the _alias_index using the return value"""
-# This is what we'll map the alias hash key to. It's where the alias will be defined
-# in the vector of aliases.
+cdef inline int64_t c_add_aliases(
+self,
+hash_t alias_hash,
+vector[int64_t] entry_indices,
+vector[float] probs
+) nogil:
+"""Connect a mention to a list of potential entities with their prior
+probabilities. After calling this method, make sure to update also the
+_alias_index using the return value"""
+# This is what we'll map the alias hash key to. It's where the alias will be
+# defined in the vector of aliases.
 cdef int64_t new_index = self._aliases_table.size()
 # Avoid struct initializer to enable nogil
@@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
 """
-Initializing the vectors and making sure the first element of each vector is a dummy,
-because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
+Initializing the vectors and making sure the first element of each vector is a
+dummy, because the PreshMap maps pointing to indices in these vectors can not
+contain 0 as value.
 cf. https://github.com/explosion/preshed/issues/17
 """
 cdef int32_t dummy_value = 0
@@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 cdef class Writer:
 cdef FILE* _fp
-cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
+cdef int write_header(
+self, int64_t nr_entries, int64_t entity_vector_length
+) except -1
 cdef int write_vector_element(self, float element) except -1
-cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
+cdef int write_entry(
+self, hash_t entry_hash, float entry_freq, int32_t vector_index
+) except -1
 cdef int write_alias_length(self, int64_t alias_length) except -1
-cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
+cdef int write_alias_header(
+self, hash_t alias_hash, int64_t candidate_length
+) except -1
 cdef int write_alias(self, int64_t entry_index, float prob) except -1
 cdef int _write(self, void* value, size_t size) except -1
@@ -143,12 +161,18 @@ cdef class Writer:
 cdef class Reader:
 cdef FILE* _fp
-cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
+cdef int read_header(
+self, int64_t* nr_entries, int64_t* entity_vector_length
+) except -1
 cdef int read_vector_element(self, float* element) except -1
-cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
+cdef int read_entry(
+self, hash_t* entity_hash, float* freq, int32_t* vector_index
+) except -1
 cdef int read_alias_length(self, int64_t* alias_length) except -1
-cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
+cdef int read_alias_header(
+self, hash_t* alias_hash, int64_t* candidate_length
+) except -1
 cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
 cdef int _read(self, void* value, size_t size) except -1

View File

@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True
-from typing import Any, Callable, Dict, Iterable, Union
+from typing import Any, Callable, Dict, Iterable
 import srsly
@@ -27,8 +27,9 @@ from .candidate import InMemoryCandidate
 cdef class InMemoryLookupKB(KnowledgeBase):
-"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
-to support entity linking of named entities to real-world concepts.
+"""An `InMemoryLookupKB` instance stores unique identifiers for entities
+and their textual aliases, to support entity linking of named entities to
+real-world concepts.
 DOCS: https://spacy.io/api/inmemorylookupkb
 """
@@ -71,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 def add_entity(self, str entity, float freq, vector[float] entity_vector):
 """
-Add an entity to the KB, optionally specifying its log probability based on corpus frequency
+Add an entity to the KB, optionally specifying its log probability
+based on corpus frequency.
 Return the hash of the entity ID/name at the end.
 """
 cdef hash_t entity_hash = self.vocab.strings.add(entity)
@@ -83,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 # Raise an error if the provided entity vector is not of the correct length
 if len(entity_vector) != self.entity_vector_length:
-raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+raise ValueError(
+Errors.E141.format(
+found=len(entity_vector), required=self.entity_vector_length
+)
+)
 vector_index = self.c_add_vector(entity_vector=entity_vector)
-new_index = self.c_add_entity(entity_hash=entity_hash,
+new_index = self.c_add_entity(
+entity_hash=entity_hash,
 freq=freq,
 vector_index=vector_index,
-feats_row=-1)  # Features table currently not implemented
+feats_row=-1
+)  # Features table currently not implemented
 self._entry_index[entity_hash] = new_index
 return entity_hash
@@ -115,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 else:
 entity_vector = vector_list[i]
 if len(entity_vector) != self.entity_vector_length:
-raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+raise ValueError(
+Errors.E141.format(
+found=len(entity_vector),
+required=self.entity_vector_length
+)
+)
 entry.entity_hash = entity_hash
 entry.freq = freq_list[i]
@@ -149,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 previous_alias_nr = self.get_size_aliases()
 # Throw an error if the length of entities and probabilities are not the same
 if not len(entities) == len(probabilities):
-raise ValueError(Errors.E132.format(alias=alias,
+raise ValueError(
+Errors.E132.format(
+alias=alias,
 entities_length=len(entities),
-probabilities_length=len(probabilities)))
+probabilities_length=len(probabilities))
+)
-# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
+# Throw an error if the probabilities sum up to more than 1 (allow for
+# some rounding errors)
 prob_sum = sum(probabilities)
 if prob_sum > 1.00001:
 raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
@@ -170,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 for entity, prob in zip(entities, probabilities):
 entity_hash = self.vocab.strings[entity]
-if not entity_hash in self._entry_index:
+if entity_hash not in self._entry_index:
 raise ValueError(Errors.E134.format(entity=entity))
 entry_index = <int64_t>self._entry_index.get(entity_hash)
 entry_indices.push_back(int(entry_index))
 probs.push_back(float(prob))
-new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
+new_index = self.c_add_aliases(
+alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
+)
 self._alias_index[alias_hash] = new_index
 if previous_alias_nr + 1 != self.get_size_aliases():
 raise RuntimeError(Errors.E891.format(alias=alias))
 return alias_hash
-def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
+def append_alias(
+self, str alias, str entity, float prior_prob, ignore_warnings=False
+):
 """
-For an alias already existing in the KB, extend its potential entities with one more.
+For an alias already existing in the KB, extend its potential entities
+with one more.
 Throw a warning if either the alias or the entity is unknown,
 or when the combination is already previously recorded.
 Throw an error if this entity+prior prob would exceed the sum of 1.
-For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
+For efficiency, it's best to use the method `add_alias` as much as
+possible instead of this one.
 """
 # Check if the alias exists in the KB
 cdef hash_t alias_hash = self.vocab.strings[alias]
-if not alias_hash in self._alias_index:
+if alias_hash not in self._alias_index:
 raise ValueError(Errors.E176.format(alias=alias))
 # Check if the entity exists in the KB
 cdef hash_t entity_hash = self.vocab.strings[entity]
-if not entity_hash in self._entry_index:
+if entity_hash not in self._entry_index:
 raise ValueError(Errors.E134.format(entity=entity))
 entry_index = <int64_t>self._entry_index.get(entity_hash)
-# Throw an error if the prior probabilities (including the new one) sum up to more than 1
+# Throw an error if the prior probabilities (including the new one)
+# sum up to more than 1
 alias_index = <int64_t>self._alias_index.get(alias_hash)
 alias_entry = self._aliases_table[alias_index]
 current_sum = sum([p for p in alias_entry.probs])
@@ -236,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
 """
-Return candidate entities for an alias. Each candidate defines the entity, the original alias,
-and the prior probability of that alias resolving to that entity.
+Return candidate entities for an alias. Each candidate defines the
+entity, the original alias, and the prior probability of that alias
+resolving to that entity.
 If the alias is not known in the KB, and empty list is returned.
 """
 cdef hash_t alias_hash = self.vocab.strings[alias]
-if not alias_hash in self._alias_index:
+if alias_hash not in self._alias_index:
 return []
 alias_index = <int64_t>self._alias_index.get(alias_hash)
 alias_entry = self._aliases_table[alias_index]
@@ -270,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 return self._vectors_table[self._entries[entry_index].vector_index]
 def get_prior_prob(self, str entity, str alias):
-""" Return the prior probability of a given alias being linked to a given entity,
-or return 0.0 when this combination is not known in the knowledge base"""
+""" Return the prior probability of a given alias being linked to a
+given entity, or return 0.0 when this combination is not known in the
+knowledge base."""
 cdef hash_t alias_hash = self.vocab.strings[alias]
 cdef hash_t entity_hash = self.vocab.strings[entity]
@@ -282,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 entry_index = self._entry_index[entity_hash]
 alias_entry = self._aliases_table[alias_index]
-for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
+for (entry_index, prior_prob) in zip(
+alias_entry.entry_indices, alias_entry.probs
+):
 if self._entries[entry_index].entity_hash == entity_hash:
 return prior_prob
@@ -295,13 +323,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 """Serialize the current state to a binary string.
 """
 def serialize_header():
-header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
+header = (
+self.get_size_entities(),
+self.get_size_aliases(),
+self.entity_vector_length
+)
 return srsly.json_dumps(header)
 def serialize_entries():
 i = 1
 tuples = []
-for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+for entry_hash, entry_index in sorted(
+self._entry_index.items(), key=lambda x: x[1]
+):
 entry = self._entries[entry_index]
 assert entry.entity_hash == entry_hash
 assert entry_index == i
@@ -314,7 +348,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 headers = []
 indices_lists = []
 probs_lists = []
-for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+for alias_hash, alias_index in sorted(
+self._alias_index.items(), key=lambda x: x[1]
+):
 alias = self._aliases_table[alias_index]
 assert alias_index == i
 candidate_length = len(alias.entry_indices)
@@ -372,7 +408,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 indices = srsly.json_loads(all_data[1])
 probs = srsly.json_loads(all_data[2])
 for header, indices, probs in zip(headers, indices, probs):
-alias_hash, candidate_length = header
+alias_hash, _candidate_length = header
 alias.entry_indices = indices
 alias.probs = probs
 self._aliases_table[i] = alias
@@ -421,10 +457,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 writer.write_vector_element(element)
 i = i+1
-# dumping the entry records in the order in which they are in the _entries vector.
-# index 0 is a dummy object not stored in the _entry_index and can be ignored.
+# dumping the entry records in the order in which they are in the
+# _entries vector.
+# index 0 is a dummy object not stored in the _entry_index and can
+# be ignored.
 i = 1
-for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+for entry_hash, entry_index in sorted(
+self._entry_index.items(), key=lambda x: x[1]
+):
 entry = self._entries[entry_index]
 assert entry.entity_hash == entry_hash
 assert entry_index == i
@@ -436,7 +476,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 # dumping the aliases in the order in which they are in the _alias_index vector.
 # index 0 is a dummy object not stored in the _aliases_table and can be ignored.
 i = 1
-for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+for alias_hash, alias_index in sorted(
+self._alias_index.items(), key=lambda x: x[1]
+):
 alias = self._aliases_table[alias_index]
 assert alias_index == i
@@ -542,7 +584,8 @@ cdef class Writer:
 def __init__(self, path):
 assert isinstance(path, Path)
 content = bytes(path)
-cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+cdef bytes bytes_loc = content.encode('utf8') \
+if type(content) == str else content
 self._fp = fopen(<char*>bytes_loc, 'wb')
 if not self._fp:
 raise IOError(Errors.E146.format(path=path))
@@ -552,14 +595,18 @@ cdef class Writer:
 cdef size_t status = fclose(self._fp)
 assert status == 0
-cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
+cdef int write_header(
+self, int64_t nr_entries, int64_t entity_vector_length
+) except -1:
 self._write(&nr_entries, sizeof(nr_entries))
 self._write(&entity_vector_length, sizeof(entity_vector_length))
 cdef int write_vector_element(self, float element) except -1:
 self._write(&element, sizeof(element))
-cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
+cdef int write_entry(
+self, hash_t entry_hash, float entry_freq, int32_t vector_index
+) except -1:
 self._write(&entry_hash, sizeof(entry_hash))
 self._write(&entry_freq, sizeof(entry_freq))
 self._write(&vector_index, sizeof(vector_index))
@@ -568,7 +615,9 @@ cdef class Writer:
 cdef int write_alias_length(self, int64_t alias_length) except -1:
 self._write(&alias_length, sizeof(alias_length))
-cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
+cdef int write_alias_header(
+self, hash_t alias_hash, int64_t candidate_length
+) except -1:
 self._write(&alias_hash, sizeof(alias_hash))
 self._write(&candidate_length, sizeof(candidate_length))
@@ -584,16 +633,19 @@ cdef class Writer:
 cdef class Reader:
 def __init__(self, path):
 content = bytes(path)
-cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+cdef bytes bytes_loc = content.encode('utf8') \
+if type(content) == str else content
 self._fp = fopen(<char*>bytes_loc, 'rb')
 if not self._fp:
 PyErr_SetFromErrno(IOError)
-status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header
+fseek(self._fp, 0, 0)  # this can be 0 if there is no header
 def __dealloc__(self):
 fclose(self._fp)
-cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
+cdef int read_header(
+self, int64_t* nr_entries, int64_t* entity_vector_length
+) except -1:
 status = self._read(nr_entries, sizeof(int64_t))
 if status < 1:
 if feof(self._fp):
@@ -613,7 +665,9 @@ cdef class Reader:
 return 0  # end of file
 raise IOError(Errors.E145.format(param="vector element"))
-cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
+cdef int read_entry(
+self, hash_t* entity_hash, float* freq, int32_t* vector_index
+) except -1:
 status = self._read(entity_hash, sizeof(hash_t))
 if status < 1:
 if feof(self._fp):
@@ -644,7 +698,9 @@ cdef class Reader:
 return 0  # end of file
 raise IOError(Errors.E145.format(param="alias length"))
-cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
+cdef int read_alias_header(
+self, hash_t* alias_hash, int64_t* candidate_length
+) except -1:
 status = self._read(alias_hash, sizeof(hash_t))
 if status < 1:
 if feof(self._fp):
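Most of the churn in this file is line-length and lint cleanup around the public InMemoryLookupKB methods. A short usage sketch of the API being reformatted; the entity IDs, frequencies and vectors are invented for illustration:

# Hedged sketch of the InMemoryLookupKB methods touched above.
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=342, entity_vector=[1.0, 9.0, -3.0])
kb.add_entity(entity="Q5301561", freq=12, entity_vector=[0.5, 0.1, 7.0])
kb.add_entity(entity="Q7259", freq=5, entity_vector=[4.0, 0.0, 0.0])
# Prior probabilities for one alias must not sum to more than 1.0 (E133).
kb.add_alias(alias="Douglas", entities=["Q42", "Q5301561"], probabilities=[0.6, 0.3])
kb.append_alias(alias="Douglas", entity="Q7259", prior_prob=0.05)
print(kb.get_size_entities(), kb.get_size_aliases())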

View File

@@ -740,6 +740,11 @@ class Language:
 )
 )
 pipe = source.get_pipe(source_name)
+# There is no actual solution here. Either the component has the right
+# name for the source pipeline or the component has the right name for
+# the current pipeline. This prioritizes the current pipeline.
+if hasattr(pipe, "name"):
+pipe.name = name
 # Make sure the source config is interpolated so we don't end up with
 # orphaned variables in our final config
 source_config = source.config.interpolate()
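The new pipe.name assignment means a component sourced under a different name now reports the name it has in the current pipeline rather than the one it had in the source pipeline. A hedged sketch of that scenario (the component choice is arbitrary):

# Hedged sketch: sourcing a component under a new name.
import spacy

source_nlp = spacy.blank("en")
source_nlp.add_pipe("sentencizer")

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer", source=source_nlp, name="my_sentencizer")
# With the change above, the component's own .name follows the new pipeline.
print(nlp.pipe_names)  # ['my_sentencizer']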
@@ -817,6 +822,7 @@ class Language:
 pipe_index = self._get_pipe_index(before, after, first, last)
 self._pipe_meta[name] = self.get_factory_meta(factory_name)
 self._components.insert(pipe_index, (name, pipe_component))
+self._link_components()
 return pipe_component
 def _get_pipe_index(
@@ -956,6 +962,7 @@ class Language:
 if old_name in self._config["initialize"]["components"]:
 init_cfg = self._config["initialize"]["components"].pop(old_name)
 self._config["initialize"]["components"][new_name] = init_cfg
+self._link_components()
 def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
 """Remove a component from the pipeline.
@@ -979,6 +986,7 @@ class Language:
 # Make sure the name is also removed from the set of disabled components
 if name in self.disabled:
 self._disabled.remove(name)
+self._link_components()
 return removed
 def disable_pipe(self, name: str) -> None:
@@ -1823,8 +1831,16 @@ class Language:
 # The problem is we need to do it during deserialization...And the
 # components don't receive the pipeline then. So this does have to be
 # here :(
+# First, fix up all the internal component names in case they have
+# gotten out of sync due to sourcing components from different
+# pipelines, since find_listeners uses proc2.name for the listener
+# map.
+for name, proc in self.pipeline:
+if hasattr(proc, "name"):
+proc.name = name
 for i, (name1, proc1) in enumerate(self.pipeline):
 if isinstance(proc1, ty.ListenedToComponent):
+proc1.listener_map = {}
 for name2, proc2 in self.pipeline[i + 1 :]:
 proc1.find_listeners(proc2)
@@ -1934,7 +1950,6 @@ class Language:
 # Later we replace the component config with the raw config again.
 interpolated = filled.interpolate() if not filled.is_interpolated else filled
 pipeline = interpolated.get("components", {})
-sourced = util.get_sourced_components(interpolated)
 # If components are loaded from a source (existing models), we cache
 # them here so they're only loaded once
 source_nlps = {}
@@ -1962,6 +1977,7 @@ class Language:
 raw_config=raw_config,
 )
 else:
+assert "source" in pipe_cfg
 # We need the sourced components to reference the same
 # vocab without modifying the current vocab state **AND**
 # we still want to load the source model vectors to perform
@@ -1981,6 +1997,10 @@ class Language:
 source_name = pipe_cfg.get("component", pipe_name)
 listeners_replaced = False
 if "replace_listeners" in pipe_cfg:
+# Make sure that the listened-to component has the
+# state of the source pipeline listener map so that the
+# replace_listeners method below works as intended.
+source_nlps[model]._link_components()
 for name, proc in source_nlps[model].pipeline:
 if source_name in getattr(proc, "listening_components", []):
 source_nlps[model].replace_listeners(
@@ -1992,6 +2012,8 @@ class Language:
 nlp.add_pipe(
 source_name, source=source_nlps[model], name=pipe_name
 )
+# At this point after nlp.add_pipe, the listener map
+# corresponds to the new pipeline.
 if model not in source_nlp_vectors_hashes:
 source_nlp_vectors_hashes[model] = hash(
 source_nlps[model].vocab.vectors.to_bytes(
@@ -2046,27 +2068,6 @@ class Language:
 raise ValueError(
 Errors.E942.format(name="pipeline_creation", value=type(nlp))
 )
-# Detect components with listeners that are not frozen consistently
-for name, proc in nlp.pipeline:
-if isinstance(proc, ty.ListenedToComponent):
-# Remove listeners not in the pipeline
-listener_names = proc.listening_components
-unused_listener_names = [
-ll for ll in listener_names if ll not in nlp.pipe_names
-]
-for listener_name in unused_listener_names:
-for listener in proc.listener_map.get(listener_name, []):
-proc.remove_listener(listener, listener_name)
-for listener_name in proc.listening_components:
-# e.g. tok2vec/transformer
-# If it's a component sourced from another pipeline, we check if
-# the tok2vec listeners should be replaced with standalone tok2vec
-# models (e.g. so component can be frozen without its performance
-# degrading when other components/tok2vec are updated)
-paths = sourced.get(listener_name, {}).get("replace_listeners", [])
-if paths:
-nlp.replace_listeners(name, listener_name, paths)
 return nlp
 def replace_listeners(
@@ -2081,7 +2082,7 @@ class Language:
 useful when training a pipeline with components sourced from an existing
 pipeline: if multiple components (e.g. tagger, parser, NER) listen to
 the same tok2vec component, but some of them are frozen and not updated,
-their performance may degrade significally as the tok2vec component is
+their performance may degrade significantly as the tok2vec component is
 updated with new data. To prevent this, listeners can be replaced with
 a standalone tok2vec layer that is owned by the component and doesn't
 change if the component isn't updated.
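For reference, the scenario this docstring describes maps to a single call on a loaded pipeline. A hedged sketch, assuming the en_core_web_sm package is installed; "model.tok2vec" is the usual config path of the listener inside the tagger model:

# Hedged sketch of replace_listeners(): give the tagger its own tok2vec copy
# so the shared tok2vec can keep updating without degrading a frozen tagger.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])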

View File

@@ -1,7 +1,6 @@
 # cython: embedsignature=True
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
-from cython.view cimport array as cvarray
 from libc.string cimport memset
 np.import_array()
@@ -137,9 +136,11 @@ cdef class Lexeme:
 if hasattr(other, "orth"):
 if self.c.orth == other.orth:
 return 1.0
-elif hasattr(other, "__len__") and len(other) == 1 \
-and hasattr(other[0], "orth"):
-if self.c.orth == other[0].orth:
+elif (
+hasattr(other, "__len__") and len(other) == 1
+and hasattr(other[0], "orth")
+and self.c.orth == other[0].orth
+):
 return 1.0
 if self.vector_norm == 0 or other.vector_norm == 0:
 warnings.warn(Warnings.W008.format(obj="Lexeme"))

View File

@@ -108,7 +108,7 @@ cdef class DependencyMatcher:
 key (str): The match ID.
 RETURNS (bool): Whether the matcher contains rules for this match ID.
 """
-return self.has_key(key)
+return self.has_key(key)  # no-cython-lint: W601
 def _validate_input(self, pattern, key):
 idx = 0
@@ -264,7 +264,7 @@ cdef class DependencyMatcher:
 def remove(self, key):
 key = self._normalize_key(key)
-if not key in self._patterns:
+if key not in self._patterns:
 raise ValueError(Errors.E175.format(key=key))
 self._patterns.pop(key)
 self._raw_patterns.pop(key)
@@ -382,7 +382,7 @@ cdef class DependencyMatcher:
 return []
 return [doc[node].head]
-def _gov(self,doc,node):
+def _gov(self, doc, node):
 return list(doc[node].children)
 def _dep_chain(self, doc, node):

View File

@ -12,25 +12,13 @@ import warnings
import srsly import srsly
from ..attrs cimport ( from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
DEP,
ENT_IOB,
ID,
LEMMA,
MORPH,
NULL_ATTR,
ORTH,
POS,
TAG,
attr_id_t,
)
from ..structs cimport TokenC from ..structs cimport TokenC
from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.morphanalysis cimport MorphAnalysis from ..tokens.morphanalysis cimport MorphAnalysis
from ..tokens.span cimport Span from ..tokens.span cimport Span
from ..tokens.token cimport Token from ..tokens.token cimport Token
from ..typedefs cimport attr_t from ..typedefs cimport attr_t
from ..vocab cimport Vocab
from ..errors import Errors, MatchPatternError, Warnings from ..errors import Errors, MatchPatternError, Warnings
from ..schemas import validate_token_pattern from ..schemas import validate_token_pattern
@ -42,7 +30,6 @@ from ..attrs import IDS
from ..errors import Errors, MatchPatternError, Warnings from ..errors import Errors, MatchPatternError, Warnings
from ..schemas import validate_token_pattern from ..schemas import validate_token_pattern
from ..strings import get_string_id from ..strings import get_string_id
from ..util import registry
from .levenshtein import levenshtein_compare from .levenshtein import levenshtein_compare
DEF PADDING = 5 DEF PADDING = 5
@ -93,9 +80,9 @@ cdef class Matcher:
key (str): The match ID. key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID. RETURNS (bool): Whether the matcher contains rules for this match ID.
""" """
return self.has_key(key) return self.has_key(key) # no-cython-lint: W601
def add(self, key, patterns, *, on_match=None, greedy: str=None): def add(self, key, patterns, *, on_match=None, greedy: str = None):
"""Add a match-rule to the matcher. A match-rule consists of: an ID """Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns. key, an on_match callback, and one or more patterns.
@ -149,8 +136,13 @@ cdef class Matcher:
key = self._normalize_key(key) key = self._normalize_key(key)
for pattern in patterns: for pattern in patterns:
try: try:
specs = _preprocess_pattern(pattern, self.vocab, specs = _preprocess_pattern(
self._extensions, self._extra_predicates, self._fuzzy_compare) pattern,
self.vocab,
self._extensions,
self._extra_predicates,
self._fuzzy_compare
)
self.patterns.push_back(init_pattern(self.mem, key, specs)) self.patterns.push_back(init_pattern(self.mem, key, specs))
for spec in specs: for spec in specs:
for attr, _ in spec[1]: for attr, _ in spec[1]:
@ -174,7 +166,7 @@ cdef class Matcher:
key (str): The ID of the match rule. key (str): The ID of the match rule.
""" """
norm_key = self._normalize_key(key) norm_key = self._normalize_key(key)
if not norm_key in self._patterns: if norm_key not in self._patterns:
raise ValueError(Errors.E175.format(key=key)) raise ValueError(Errors.E175.format(key=key))
self._patterns.pop(norm_key) self._patterns.pop(norm_key)
self._callbacks.pop(norm_key) self._callbacks.pop(norm_key)
@ -274,8 +266,15 @@ cdef class Matcher:
if self.patterns.empty(): if self.patterns.empty():
matches = [] matches = []
else: else:
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, matches = find_matches(
extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments) &self.patterns[0],
self.patterns.size(),
doclike,
length,
extensions=self._extensions,
predicates=self._extra_predicates,
with_alignments=with_alignments
)
final_matches = [] final_matches = []
pairs_by_id = {} pairs_by_id = {}
# For each key, either add all matches, or only the filtered, # For each key, either add all matches, or only the filtered,
@ -373,7 +372,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
cdef vector[MatchC] matches cdef vector[MatchC] matches
cdef vector[vector[MatchAlignmentC]] align_states cdef vector[vector[MatchAlignmentC]] align_states
cdef vector[vector[MatchAlignmentC]] align_matches cdef vector[vector[MatchAlignmentC]] align_matches
cdef PatternStateC state
cdef int i, j, nr_extra_attr cdef int i, j, nr_extra_attr
cdef Pool mem = Pool() cdef Pool mem = Pool()
output = [] output = []
@ -395,14 +393,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
value = token.vocab.strings[value] value = token.vocab.strings[value]
extra_attr_values[i * nr_extra_attr + index] = value extra_attr_values[i * nr_extra_attr + index] = value
# Main loop # Main loop
cdef int nr_predicate = len(predicates)
for i in range(length): for i in range(length):
for j in range(n): for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0)) states.push_back(PatternStateC(patterns[j], i, 0))
if with_alignments != 0: if with_alignments != 0:
align_states.resize(states.size()) align_states.resize(states.size())
transition_states(states, matches, align_states, align_matches, predicate_cache, transition_states(
doclike[i], extra_attr_values, predicates, with_alignments) states,
matches,
align_states,
align_matches,
predicate_cache,
doclike[i],
extra_attr_values,
predicates,
with_alignments
)
extra_attr_values += nr_extra_attr extra_attr_values += nr_extra_attr
predicate_cache += len(predicates) predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns # Handle matches that end in 0-width patterns
@ -428,18 +434,28 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
return output return output
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, cdef void transition_states(
vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, vector[PatternStateC]& states,
vector[MatchC]& matches,
vector[vector[MatchAlignmentC]]& align_states,
vector[vector[MatchAlignmentC]]& align_matches,
int8_t* cached_py_predicates, int8_t* cached_py_predicates,
Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *: Token token,
const attr_t* extra_attrs,
py_predicates,
bint with_alignments
) except *:
cdef int q = 0 cdef int q = 0
cdef vector[PatternStateC] new_states cdef vector[PatternStateC] new_states
cdef vector[vector[MatchAlignmentC]] align_new_states cdef vector[vector[MatchAlignmentC]] align_new_states
cdef int nr_predicate = len(py_predicates)
for i in range(states.size()): for i in range(states.size()):
if states[i].pattern.nr_py >= 1: if states[i].pattern.nr_py >= 1:
update_predicate_cache(cached_py_predicates, update_predicate_cache(
states[i].pattern, token, py_predicates) cached_py_predicates,
states[i].pattern,
token,
py_predicates
)
action = get_action(states[i], token.c, extra_attrs, action = get_action(states[i], token.c, extra_attrs,
cached_py_predicates) cached_py_predicates)
if action == REJECT: if action == REJECT:
@ -475,8 +491,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
align_new_states.push_back(align_states[q]) align_new_states.push_back(align_states[q])
states[q].pattern += 1 states[q].pattern += 1
if states[q].pattern.nr_py != 0: if states[q].pattern.nr_py != 0:
update_predicate_cache(cached_py_predicates, update_predicate_cache(
states[q].pattern, token, py_predicates) cached_py_predicates,
states[q].pattern,
token,
py_predicates
)
action = get_action(states[q], token.c, extra_attrs, action = get_action(states[q], token.c, extra_attrs,
cached_py_predicates) cached_py_predicates)
# Update alignment before the transition of current state # Update alignment before the transition of current state
@ -492,8 +512,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
ent_id = get_ent_id(state.pattern) ent_id = get_ent_id(state.pattern)
if action == MATCH: if action == MATCH:
matches.push_back( matches.push_back(
MatchC(pattern_id=ent_id, start=state.start, MatchC(
length=state.length+1)) pattern_id=ent_id,
start=state.start,
length=state.length+1
)
)
# `align_matches` always corresponds to `matches` 1:1 # `align_matches` always corresponds to `matches` 1:1
if with_alignments != 0: if with_alignments != 0:
align_matches.push_back(align_states[q]) align_matches.push_back(align_states[q])
@ -501,23 +525,35 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
# push match without last token if length > 0 # push match without last token if length > 0
if state.length > 0: if state.length > 0:
matches.push_back( matches.push_back(
MatchC(pattern_id=ent_id, start=state.start, MatchC(
length=state.length)) pattern_id=ent_id,
start=state.start,
length=state.length
)
)
# MATCH_DOUBLE emits matches twice, # MATCH_DOUBLE emits matches twice,
# add one more to align_matches in order to keep 1:1 relationship # add one more to align_matches in order to keep 1:1 relationship
if with_alignments != 0: if with_alignments != 0:
align_matches.push_back(align_states[q]) align_matches.push_back(align_states[q])
# push match with last token # push match with last token
matches.push_back( matches.push_back(
MatchC(pattern_id=ent_id, start=state.start, MatchC(
length=state.length+1)) pattern_id=ent_id,
start=state.start,
length=state.length + 1
)
)
# `align_matches` always corresponds to `matches` 1:1 # `align_matches` always corresponds to `matches` 1:1
if with_alignments != 0: if with_alignments != 0:
align_matches.push_back(align_states[q]) align_matches.push_back(align_states[q])
elif action == MATCH_REJECT: elif action == MATCH_REJECT:
matches.push_back( matches.push_back(
MatchC(pattern_id=ent_id, start=state.start, MatchC(
length=state.length)) pattern_id=ent_id,
start=state.start,
length=state.length
)
)
# `align_matches` always corresponds to `matches` 1:1 # `align_matches` always corresponds to `matches` 1:1
if with_alignments != 0: if with_alignments != 0:
align_matches.push_back(align_states[q]) align_matches.push_back(align_states[q])
@ -540,8 +576,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
align_states.push_back(align_new_states[i]) align_states.push_back(align_new_states[i])
cdef int update_predicate_cache(int8_t* cache, cdef int update_predicate_cache(
const TokenPatternC* pattern, Token token, predicates) except -1: int8_t* cache,
const TokenPatternC* pattern,
Token token,
predicates
) except -1:
# If the state references any extra predicates, check whether they match. # If the state references any extra predicates, check whether they match.
# These are cached, so that we don't call these potentially expensive # These are cached, so that we don't call these potentially expensive
# Python functions more than we need to. # Python functions more than we need to.
@ -587,10 +627,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
else: else:
state.pattern += 1 state.pattern += 1
cdef action_t get_action(
cdef action_t get_action(PatternStateC state, PatternStateC state,
const TokenC* token, const attr_t* extra_attrs, const TokenC * token,
const int8_t* predicate_matches) nogil: const attr_t * extra_attrs,
const int8_t * predicate_matches
) nogil:
"""We need to consider: """We need to consider:
a) Does the token match the specification? [Yes, No] a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?] b) What's the quantifier? [1, 0+, ?]
@ -700,9 +742,12 @@ cdef action_t get_action(PatternStateC state,
return RETRY return RETRY
cdef int8_t get_is_match(PatternStateC state, cdef int8_t get_is_match(
const TokenC* token, const attr_t* extra_attrs, PatternStateC state,
const int8_t* predicate_matches) nogil: const TokenC* token,
const attr_t* extra_attrs,
const int8_t* predicate_matches
) nogil:
for i in range(state.pattern.nr_py): for i in range(state.pattern.nr_py):
if predicate_matches[state.pattern.py_predicates[i]] == -1: if predicate_matches[state.pattern.py_predicates[i]] == -1:
return 0 return 0
@ -1108,8 +1153,9 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
return output return output
def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, def _get_extension_extra_predicates(
seen_predicates): spec, extra_predicates, predicate_types, seen_predicates
):
output = [] output = []
for attr, value in spec.items(): for attr, value in spec.items():
if isinstance(value, dict): if isinstance(value, dict):
@ -1138,7 +1184,7 @@ def _get_operators(spec):
return (ONE,) return (ONE,)
elif spec["OP"] in lookup: elif spec["OP"] in lookup:
return lookup[spec["OP"]] return lookup[spec["OP"]]
#Min_max {n,m} # Min_max {n,m}
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"): elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
# {n} --> {n,n} exactly n ONE,(n) # {n} --> {n,n} exactly n ONE,(n)
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m) # {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
@ -1149,8 +1195,8 @@ def _get_operators(spec):
min_max = min_max if "," in min_max else f"{min_max},{min_max}" min_max = min_max if "," in min_max else f"{min_max},{min_max}"
n, m = min_max.split(",") n, m = min_max.split(",")
#1. Either n or m is a blank string and the other is numeric -->isdigit # 1. Either n or m is a blank string and the other is numeric -->isdigit
#2. Both are numeric and n <= m # 2. Both are numeric and n <= m
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)): if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m " keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys)) raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
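
A minimal usage sketch of the {n,m} quantifier handled above, assuming a spaCy build that accepts curly-brace operators; the pattern and text are illustrative only:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "{2,4}" expands to ONE, ONE, ZERO_ONE, ZERO_ONE as described in the comments above
matcher.add("DIGIT_RUN", [[{"IS_DIGIT": True, "OP": "{2,4}"}]])
doc = nlp("call 12 34 56 now")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)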

View File

@ -2,16 +2,14 @@
from collections import defaultdict from collections import defaultdict
from typing import List from typing import List
from libc.stdint cimport uintptr_t
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
import warnings import warnings
from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG
from ..attrs import IDS from ..attrs import IDS
from ..structs cimport TokenC
from ..tokens.span cimport Span from ..tokens.span cimport Span
from ..tokens.token cimport Token from ..tokens.token cimport Token
from ..typedefs cimport attr_t from ..typedefs cimport attr_t

View File

@ -1,3 +1,4 @@
import warnings
from typing import Callable, List, Optional, Sequence, Tuple, cast from typing import Callable, List, Optional, Sequence, Tuple, cast
from thinc.api import Model, Ops, registry from thinc.api import Model, Ops, registry
@ -5,7 +6,8 @@ from thinc.initializers import glorot_uniform_init
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
from thinc.util import partial from thinc.util import partial
from ..errors import Errors from ..attrs import ORTH
from ..errors import Errors, Warnings
from ..tokens import Doc from ..tokens import Doc
from ..vectors import Mode from ..vectors import Mode
from ..vocab import Vocab from ..vocab import Vocab
@ -24,6 +26,8 @@ def StaticVectors(
linear projection to control the dimensionality. If a dropout rate is linear projection to control the dimensionality. If a dropout rate is
specified, the dropout is applied per dimension over the whole batch. specified, the dropout is applied per dimension over the whole batch.
""" """
if key_attr != "ORTH":
warnings.warn(Warnings.W125, DeprecationWarning)
return Model( return Model(
"static_vectors", "static_vectors",
forward, forward,
@ -40,9 +44,9 @@ def forward(
token_count = sum(len(doc) for doc in docs) token_count = sum(len(doc) for doc in docs)
if not token_count: if not token_count:
return _handle_empty(model.ops, model.get_dim("nO")) return _handle_empty(model.ops, model.get_dim("nO"))
key_attr: int = model.attrs["key_attr"]
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
vocab: Vocab = docs[0].vocab vocab: Vocab = docs[0].vocab
key_attr: int = getattr(vocab.vectors, "attr", ORTH)
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
if vocab.vectors.mode == Mode.default: if vocab.vectors.mode == Mode.default:
V = model.ops.asarray(vocab.vectors.data) V = model.ops.asarray(vocab.vectors.data)

View File

@ -246,6 +246,7 @@ cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph,
n_results += 1 n_results += 1
return n_results return n_results
def unpickle_morphology(strings, tags): def unpickle_morphology(strings, tags):
cdef Morphology morphology = Morphology(strings) cdef Morphology morphology = Morphology(strings)
for tag in tags: for tag in tags:

View File

@ -46,11 +46,18 @@ cdef struct EditTreeC:
bint is_match_node bint is_match_node
NodeC inner NodeC inner
cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len, cdef inline EditTreeC edittree_new_match(
uint32_t prefix_tree, uint32_t suffix_tree): len_t prefix_len,
cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len, len_t suffix_len,
suffix_len=suffix_len, prefix_tree=prefix_tree, uint32_t prefix_tree,
suffix_tree=suffix_tree) uint32_t suffix_tree
):
cdef MatchNodeC match_node = MatchNodeC(
prefix_len=prefix_len,
suffix_len=suffix_len,
prefix_tree=prefix_tree,
suffix_tree=suffix_tree
)
cdef NodeC inner = NodeC(match_node=match_node) cdef NodeC inner = NodeC(match_node=match_node)
return EditTreeC(is_match_node=True, inner=inner) return EditTreeC(is_match_node=True, inner=inner)

View File

@ -5,8 +5,6 @@ from libc.string cimport memset
from libcpp.pair cimport pair from libcpp.pair cimport pair
from libcpp.vector cimport vector from libcpp.vector cimport vector
from pathlib import Path
from ...typedefs cimport hash_t from ...typedefs cimport hash_t
from ... import util from ... import util
@ -25,17 +23,16 @@ cdef LCS find_lcs(str source, str target):
target (str): The second string. target (str): The second string.
RETURNS (LCS): The spans of the longest common subsequences. RETURNS (LCS): The spans of the longest common subsequences.
""" """
cdef Py_ssize_t source_len = len(source)
cdef Py_ssize_t target_len = len(target) cdef Py_ssize_t target_len = len(target)
cdef size_t longest_align = 0; cdef size_t longest_align = 0
cdef int source_idx, target_idx cdef int source_idx, target_idx
cdef LCS lcs cdef LCS lcs
cdef Py_UCS4 source_cp, target_cp cdef Py_UCS4 source_cp, target_cp
memset(&lcs, 0, sizeof(lcs)) memset(&lcs, 0, sizeof(lcs))
cdef vector[size_t] prev_aligns = vector[size_t](target_len); cdef vector[size_t] prev_aligns = vector[size_t](target_len)
cdef vector[size_t] cur_aligns = vector[size_t](target_len); cdef vector[size_t] cur_aligns = vector[size_t](target_len)
for (source_idx, source_cp) in enumerate(source): for (source_idx, source_cp) in enumerate(source):
for (target_idx, target_cp) in enumerate(target): for (target_idx, target_cp) in enumerate(target):
@ -89,7 +86,7 @@ cdef class EditTrees:
cdef LCS lcs = find_lcs(form, lemma) cdef LCS lcs = find_lcs(form, lemma)
cdef EditTreeC tree cdef EditTreeC tree
cdef uint32_t tree_id, prefix_tree, suffix_tree cdef uint32_t prefix_tree, suffix_tree
if lcs_is_empty(lcs): if lcs_is_empty(lcs):
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma)) tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
else: else:
@ -289,6 +286,7 @@ def _tree2dict(tree):
tree = tree["inner"]["subst_node"] tree = tree["inner"]["subst_node"]
return(dict(tree)) return(dict(tree))
def _dict2tree(tree): def _dict2tree(tree):
errors = validate_edit_tree(tree) errors = validate_edit_tree(tree)
if errors: if errors:

View File

@ -1,12 +1,14 @@
# cython: infer_types=True # cython: infer_types=True
# cython: profile=True # cython: profile=True
cimport numpy as np
import numpy import numpy
from cpython.ref cimport Py_XDECREF, PyObject from thinc.extra.search cimport Beam
from ...typedefs cimport class_t, hash_t from thinc.extra.search import MaxViolation
from thinc.extra.search cimport MaxViolation
from ...typedefs cimport class_t
from .transition_system cimport Transition, TransitionSystem from .transition_system cimport Transition, TransitionSystem
from ...errors import Errors from ...errors import Errors
@ -146,7 +148,6 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_de
cdef MaxViolation violn cdef MaxViolation violn
pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density) pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
gbeam = BeamBatch(moves, states, golds, width=width, density=0.0) gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
cdef StateClass state
beam_maps = [] beam_maps = []
backprops = [] backprops = []
violns = [MaxViolation() for _ in range(len(states))] violns = [MaxViolation() for _ in range(len(states))]

View File

@ -280,7 +280,6 @@ cdef cppclass StateC:
return n return n
int n_L(int head) nogil const: int n_L(int head) nogil const:
return n_arcs(this._left_arcs, head) return n_arcs(this._left_arcs, head)

View File

@ -9,7 +9,7 @@ from ...strings cimport hash_string
from ...structs cimport TokenC from ...structs cimport TokenC
from ...tokens.doc cimport Doc, set_children_from_heads from ...tokens.doc cimport Doc, set_children_from_heads
from ...tokens.token cimport MISSING_DEP from ...tokens.token cimport MISSING_DEP
from ...typedefs cimport attr_t, hash_t from ...typedefs cimport attr_t
from ...training import split_bilu_label from ...training import split_bilu_label
@ -68,8 +68,9 @@ cdef struct GoldParseStateC:
weight_t pop_cost weight_t pop_cost
cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state, cdef GoldParseStateC create_gold_state(
heads, labels, sent_starts) except *: Pool mem, const StateC* state, heads, labels, sent_starts
) except *:
cdef GoldParseStateC gs cdef GoldParseStateC gs
gs.length = len(heads) gs.length = len(heads)
gs.stride = 1 gs.stride = 1
@ -82,7 +83,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0])) gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
for i, is_sent_start in enumerate(sent_starts): for i, is_sent_start in enumerate(sent_starts):
if is_sent_start == True: if is_sent_start is True:
gs.state_bits[i] = set_state_flag( gs.state_bits[i] = set_state_flag(
gs.state_bits[i], gs.state_bits[i],
IS_SENT_START, IS_SENT_START,
@ -210,6 +211,7 @@ cdef class ArcEagerGold:
def update(self, StateClass stcls): def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c) update_gold_state(&self.c, stcls.c)
def _get_aligned_sent_starts(example): def _get_aligned_sent_starts(example):
"""Get list of SENT_START attributes aligned to the predicted tokenization. """Get list of SENT_START attributes aligned to the predicted tokenization.
If the reference has no sentence starts, return a list of None values. If the reference has no sentence starts, return a list of None values.
@ -524,7 +526,6 @@ cdef class Break:
""" """
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int i
if st.buffer_length() < 2: if st.buffer_length() < 2:
return False return False
elif st.B(1) != st.B(0) + 1: elif st.B(1) != st.B(0) + 1:
@ -556,8 +557,8 @@ cdef class Break:
cost -= 1 cost -= 1
if gold.heads[si] == b0: if gold.heads[si] == b0:
cost -= 1 cost -= 1
if not is_sent_start(gold, state.B(1)) \ if not is_sent_start(gold, state.B(1)) and\
and not is_sent_start_unknown(gold, state.B(1)): not is_sent_start_unknown(gold, state.B(1)):
cost += 1 cost += 1
return cost return cost
@ -805,7 +806,6 @@ cdef class ArcEager(TransitionSystem):
raise TypeError(Errors.E909.format(name="ArcEagerGold")) raise TypeError(Errors.E909.format(name="ArcEagerGold"))
cdef ArcEagerGold gold_ = gold cdef ArcEagerGold gold_ = gold
gold_state = gold_.c gold_state = gold_.c
n_gold = 0
if self.c[i].is_valid(stcls.c, self.c[i].label): if self.c[i].is_valid(stcls.c, self.c[i].label):
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label) cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
else: else:
@ -878,7 +878,7 @@ cdef class ArcEager(TransitionSystem):
print("Gold") print("Gold")
for token in example.y: for token in example.y:
print(token.i, token.text, token.dep_, token.head.text) print(token.i, token.text, token.dep_, token.head.text)
aligned_heads, aligned_labels = example.get_aligned_parse() aligned_heads, _aligned_labels = example.get_aligned_parse()
print("Aligned heads") print("Aligned heads")
for i, head in enumerate(aligned_heads): for i, head in enumerate(aligned_heads):
print(example.x[i], example.x[head] if head is not None else "__") print(example.x[i], example.x[head] if head is not None else "__")

View File

@ -1,6 +1,3 @@
import os
import random
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from libc.stdint cimport int32_t from libc.stdint cimport int32_t
from libcpp.memory cimport shared_ptr from libcpp.memory cimport shared_ptr
@ -14,7 +11,7 @@ from ...tokens.span import Span
from ...attrs cimport IS_SPACE from ...attrs cimport IS_SPACE
from ...lexeme cimport Lexeme from ...lexeme cimport Lexeme
from ...structs cimport SpanC, TokenC from ...structs cimport SpanC
from ...tokens.span cimport Span from ...tokens.span cimport Span
from ...typedefs cimport attr_t, weight_t from ...typedefs cimport attr_t, weight_t
@ -142,7 +139,6 @@ cdef class BiluoPushDown(TransitionSystem):
for entity_type in kwargs.get('entity_types', []): for entity_type in kwargs.get('entity_types', []):
for action in (BEGIN, IN, LAST, UNIT): for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = 1 actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U')
for example in kwargs.get('examples', []): for example in kwargs.get('examples', []):
for token in example.y: for token in example.y:
ent_type = token.ent_type_ ent_type = token.ent_type_
@ -324,7 +320,6 @@ cdef class BiluoPushDown(TransitionSystem):
raise TypeError(Errors.E909.format(name="BiluoGold")) raise TypeError(Errors.E909.format(name="BiluoGold"))
cdef BiluoGold gold_ = gold cdef BiluoGold gold_ = gold
gold_state = gold_.c gold_state = gold_.c
n_gold = 0
if self.c[i].is_valid(stcls.c, self.c[i].label): if self.c[i].is_valid(stcls.c, self.c[i].label):
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label) cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
else: else:
@ -487,10 +482,8 @@ cdef class In:
@staticmethod @staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold gold = <GoldNERStateC*>_gold
move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner) cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING: if g_act == MISSING:
@ -550,12 +543,10 @@ cdef class Last:
@staticmethod @staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold gold = <GoldNERStateC*>_gold
move = LAST
b0 = s.B(0) b0 = s.B(0)
ent_start = s.E(0) ent_start = s.E(0)
cdef int g_act = gold.ner[b0].move cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label
cdef int cost = 0 cdef int cost = 0
@ -655,7 +646,6 @@ cdef class Unit:
return cost return cost
cdef class Out: cdef class Out:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
@ -678,7 +668,6 @@ cdef class Out:
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef weight_t cost = 0 cdef weight_t cost = 0
if g_act == MISSING: if g_act == MISSING:
pass pass

View File

@ -125,14 +125,17 @@ def decompose(label):
def is_decorated(label): def is_decorated(label):
return DELIMITER in label return DELIMITER in label
def count_decorated_labels(gold_data): def count_decorated_labels(gold_data):
freqs = {} freqs = {}
for example in gold_data: for example in gold_data:
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"), proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
example.get_aligned("DEP")) example.get_aligned("DEP"))
# set the label to ROOT for each root dependent # set the label to ROOT for each root dependent
deco_deps = ['ROOT' if head == i else deco_deps[i] deco_deps = [
for i, head in enumerate(proj_heads)] 'ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)
]
# count label frequencies # count label frequencies
for label in deco_deps: for label in deco_deps:
if is_decorated(label): if is_decorated(label):
@ -160,9 +163,9 @@ def projectivize(heads, labels):
cdef vector[int] _heads_to_c(heads): cdef vector[int] _heads_to_c(heads):
cdef vector[int] c_heads; cdef vector[int] c_heads
for head in heads: for head in heads:
if head == None: if head is None:
c_heads.push_back(-1) c_heads.push_back(-1)
else: else:
assert head < len(heads) assert head < len(heads)
@ -199,6 +202,7 @@ def _decorate(heads, proj_heads, labels):
deco_labels.append(labels[tokenid]) deco_labels.append(labels[tokenid])
return deco_labels return deco_labels
def get_smallest_nonproj_arc_slow(heads): def get_smallest_nonproj_arc_slow(heads):
cdef vector[int] c_heads = _heads_to_c(heads) cdef vector[int] c_heads = _heads_to_c(heads)
return _get_smallest_nonproj_arc(c_heads) return _get_smallest_nonproj_arc(c_heads)

View File

@ -1,6 +1,4 @@
# cython: infer_types=True # cython: infer_types=True
import numpy
from libcpp.vector cimport vector from libcpp.vector cimport vector
from ...tokens.doc cimport Doc from ...tokens.doc cimport Doc
@ -42,11 +40,11 @@ cdef class StateClass:
cdef vector[ArcC] arcs cdef vector[ArcC] arcs
self.c.get_arcs(&arcs) self.c.get_arcs(&arcs)
return list(arcs) return list(arcs)
#py_arcs = [] # py_arcs = []
#for arc in arcs: # for arc in arcs:
# if arc.head != -1 and arc.child != -1: # if arc.head != -1 and arc.child != -1:
# py_arcs.append((arc.head, arc.child, arc.label)) # py_arcs.append((arc.head, arc.child, arc.label))
#return arcs # return arcs
def add_arc(self, int head, int child, int label): def add_arc(self, int head, int child, int label):
self.c.add_arc(head, child, label) self.c.add_arc(head, child, label)
@ -138,7 +136,7 @@ cdef class StateClass:
def at_break(self): def at_break(self):
return False return False
#return self.c.at_break() # return self.c.at_break()
def has_head(self, int i): def has_head(self, int i):
return self.c.has_head(i) return self.c.has_head(i)

View File

@ -20,11 +20,15 @@ cdef struct Transition:
int (*do)(StateC* state, attr_t label) nogil int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)(const StateC* state, const void* gold, ctypedef weight_t (*get_cost_func_t)(
attr_tlabel) nogil const StateC* state, const void* gold, attr_tlabel
ctypedef weight_t (*move_cost_func_t)(const StateC* state, const void* gold) nogil ) nogil
ctypedef weight_t (*label_cost_func_t)(const StateC* state, const void* ctypedef weight_t (*move_cost_func_t)(
gold, attr_t label) nogil const StateC* state, const void* gold
) nogil
ctypedef weight_t (*label_cost_func_t)(
const StateC* state, const void* gold, attr_t label
) nogil
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil

View File

@ -10,9 +10,7 @@ from collections import Counter
import srsly import srsly
from ...structs cimport TokenC from ...structs cimport TokenC
from ...tokens.doc cimport Doc
from ...typedefs cimport attr_t, weight_t from ...typedefs cimport attr_t, weight_t
from . cimport _beam_utils
from ._parser_utils cimport arg_max_if_valid from ._parser_utils cimport arg_max_if_valid
from .stateclass cimport StateClass from .stateclass cimport StateClass
@ -270,7 +268,6 @@ cdef class TransitionSystem:
return self return self
def to_bytes(self, exclude=tuple()): def to_bytes(self, exclude=tuple()):
transitions = []
serializers = { serializers = {
'moves': lambda: srsly.json_dumps(self.labels), 'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes(), 'strings': lambda: self.strings.to_bytes(),

View File

@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from collections import defaultdict from collections import defaultdict
from typing import Callable, Iterable, Optional from typing import Callable, Optional
from thinc.api import Config, Model from thinc.api import Config, Model

View File

@ -5,7 +5,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Union
import srsly import srsly
from thinc.api import Config, Model from thinc.api import Config, Model
from thinc.legacy import LegacySequenceCategoricalCrossentropy from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d
from ..morphology cimport Morphology from ..morphology cimport Morphology
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
@ -16,11 +15,9 @@ from ..errors import Errors
from ..language import Language from ..language import Language
from ..parts_of_speech import IDS as POS_IDS from ..parts_of_speech import IDS as POS_IDS
from ..scorer import Scorer from ..scorer import Scorer
from ..symbols import POS
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry from ..util import registry
from .pipe import deserialize_config from .tagger import Tagger
from .tagger import ActivationsT, Tagger
# See #9050 # See #9050
BACKWARD_OVERWRITE = True BACKWARD_OVERWRITE = True
@ -86,8 +83,11 @@ def morphologizer_score(examples, **kwargs):
results = {} results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples, results.update(
"morph", getter=morph_key_getter, **kwargs)) Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
return results return results
@ -249,7 +249,6 @@ class Morphologizer(Tagger):
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"] cdef bint overwrite = self.cfg["overwrite"]
cdef bint extend = self.cfg["extend"] cdef bint extend = self.cfg["extend"]

View File

@ -1,12 +1,12 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from collections import defaultdict from collections import defaultdict
from typing import Callable, Iterable, Optional from typing import Callable, Optional
from thinc.api import Config, Model from thinc.api import Config, Model
from ..language import Language from ..language import Language
from ..scorer import PRFScore, get_ner_prf from ..scorer import get_ner_prf
from ..training import remove_bilu_prefix, validate_examples from ..training import remove_bilu_prefix
from ..util import registry from ..util import registry
from ._parser_internals.ner import BiluoPushDown from ._parser_internals.ner import BiluoPushDown
from ._parser_internals.transition_system import TransitionSystem from ._parser_internals.transition_system import TransitionSystem

View File

@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
import warnings import warnings
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
import srsly import srsly
@ -33,7 +33,7 @@ cdef class Pipe:
""" """
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name)) raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are the hood when the nlp object is called on a text and all components are
applied to the Doc. applied to the Doc.
@ -52,7 +52,7 @@ cdef class Pipe:
except Exception as e: except Exception as e:
error_handler(self.name, self, [doc], e) error_handler(self.name, self, [doc], e)
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
"""Initialize the pipe. For non-trainable components, this method """Initialize the pipe. For non-trainable components, this method
is optional. For trainable components, which should inherit is optional. For trainable components, which should inherit
from the subclass TrainablePipe, the provided data examples from the subclass TrainablePipe, the provided data examples

View File

@ -7,11 +7,11 @@ from ..tokens.doc cimport Doc
from .. import util from .. import util
from ..language import Language from ..language import Language
from ..scorer import Scorer
from .pipe import Pipe from .pipe import Pipe
from .senter import senter_score from .senter import senter_score
@Language.factory( @Language.factory(
"sentencizer", "sentencizer",
assigns=["token.is_sent_start", "doc.sents"], assigns=["token.is_sent_start", "doc.sents"],
@ -34,7 +34,8 @@ class Sentencizer(Pipe):
DOCS: https://spacy.io/api/sentencizer DOCS: https://spacy.io/api/sentencizer
""" """
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', default_punct_chars = [
'!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '᱿', '', '', '', '', '', '', '', '', '', '', '', '', '᱿',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
@ -44,7 +45,8 @@ class Sentencizer(Pipe):
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
'', ''] '', ''
]
def __init__( def __init__(
self, self,
@ -127,7 +129,6 @@ class Sentencizer(Pipe):
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef int idx = 0
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
@ -168,7 +169,6 @@ class Sentencizer(Pipe):
path = path.with_suffix(".json") path = path.with_suffix(".json")
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite}) srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_disk(self, path, *, exclude=tuple()): def from_disk(self, path, *, exclude=tuple()):
"""Load the sentencizer from disk. """Load the sentencizer from disk.

View File

@ -1,11 +1,9 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from itertools import islice from itertools import islice
from typing import Callable, Dict, Iterable, List, Optional, Union from typing import Callable, Iterable, Optional
import srsly
from thinc.api import Config, Model from thinc.api import Config, Model
from thinc.legacy import LegacySequenceCategoricalCrossentropy from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc

View File

@ -48,14 +48,14 @@ DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model
"threshold": 0.5, "threshold": 0.5,
"model": DEFAULT_SPAN_FINDER_MODEL, "model": DEFAULT_SPAN_FINDER_MODEL,
"spans_key": DEFAULT_SPANS_KEY, "spans_key": DEFAULT_SPANS_KEY,
"max_length": None, "max_length": 25,
"min_length": None, "min_length": None,
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"}, "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
f"span_finder_{DEFAULT_SPANS_KEY}_f": 1.0, f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
f"span_finder_{DEFAULT_SPANS_KEY}_p": 0.0, f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
f"span_finder_{DEFAULT_SPANS_KEY}_r": 0.0, f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
}, },
) )
def make_span_finder( def make_span_finder(
@ -104,7 +104,7 @@ def make_span_finder_scorer():
def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs) kwargs = dict(kwargs)
attr_prefix = "span_finder_" attr_prefix = "spans_"
key = kwargs["spans_key"] key = kwargs["spans_key"]
kwargs.setdefault("attr", f"{attr_prefix}{key}") kwargs.setdefault("attr", f"{attr_prefix}{key}")
kwargs.setdefault( kwargs.setdefault(
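
A minimal sketch of the renamed score keys and the new max_length default shown above, assuming a spaCy build that ships the span_finder factory; the spans key is the default from the registration:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("span_finder", config={"spans_key": "sc", "max_length": 25})
# After training, evaluation reports the scores under the spans_ prefix,
# e.g. "spans_sc_f", "spans_sc_p" and "spans_sc_r".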

View File

@ -1,27 +1,20 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
import warnings
from itertools import islice from itertools import islice
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy import numpy
import srsly
from thinc.api import Config, Model, set_dropout_rate from thinc.api import Config, Model, set_dropout_rate
from thinc.legacy import LegacySequenceCategoricalCrossentropy from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d from thinc.types import Floats2d, Ints1d
from ..morphology cimport Morphology
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from .. import util from .. import util
from ..attrs import ID, POS from ..errors import Errors
from ..errors import Errors, Warnings
from ..language import Language from ..language import Language
from ..parts_of_speech import X
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry from ..util import registry
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
@ -188,7 +181,6 @@ class Tagger(TrainablePipe):
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"] cdef bint overwrite = self.cfg["overwrite"]
labels = self.labels labels = self.labels
for i, doc in enumerate(docs): for i, doc in enumerate(docs):

View File

@ -103,7 +103,7 @@ cdef class TrainablePipe(Pipe):
losses[self.name] += loss losses[self.name] += loss
return losses return losses
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are the hood when the nlp object is called on a text and all components are
applied to the Doc. applied to the Doc.
@ -150,9 +150,9 @@ cdef class TrainablePipe(Pipe):
def update(self, def update(self,
examples: Iterable["Example"], examples: Iterable["Example"],
*, *,
drop: float=0.0, drop: float = 0.0,
sgd: Optimizer=None, sgd: Optimizer = None,
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: losses: Optional[Dict[str, float]] = None) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information, """Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss. updating the pipe's model. Delegates to predict and get_loss.
@ -186,8 +186,8 @@ cdef class TrainablePipe(Pipe):
def rehearse(self, def rehearse(self,
examples: Iterable[Example], examples: Iterable[Example],
*, *,
sgd: Optimizer=None, sgd: Optimizer = None,
losses: Dict[str, float]=None, losses: Dict[str, float] = None,
**config) -> Dict[str, float]: **config) -> Dict[str, float]:
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates """Perform a "rehearsal" update from a batch of data. Rehearsal updates
teach the current model to make predictions similar to an initial model, teach the current model to make predictions similar to an initial model,
@ -238,7 +238,7 @@ cdef class TrainablePipe(Pipe):
""" """
return util.create_default_optimizer() return util.create_default_optimizer()
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each TrainablePipe component, This method needs to be implemented by each TrainablePipe component,
ensuring the internal model (if available) is initialized properly ensuring the internal model (if available) is initialized properly

View File

@ -8,58 +8,35 @@ from cymem.cymem cimport Pool
from itertools import islice from itertools import islice
from libc.stdlib cimport calloc, free
from libc.string cimport memcpy, memset
from libcpp.vector cimport vector
import contextlib import contextlib
import random import random
import warnings
import numpy import numpy
import numpy.random import numpy.random
import srsly import srsly
from thinc.api import (
CupyOps, from thinc.api import CupyOps, NumpyOps, set_dropout_rate
NumpyOps,
Optimizer,
chain,
get_array_module,
get_ops,
set_dropout_rate,
softmax_activation,
use_ops,
)
from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d from thinc.types import Floats2d, Ints1d
from ..ml.tb_framework import TransitionModelInputs from ..ml.tb_framework import TransitionModelInputs
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ._parser_internals cimport _beam_utils from ._parser_internals cimport _beam_utils
from ._parser_internals.search cimport Beam
from ._parser_internals.stateclass cimport StateC, StateClass from ._parser_internals.stateclass cimport StateC, StateClass
from .trainable_pipe cimport TrainablePipe from .trainable_pipe cimport TrainablePipe
from ._parser_internals import _beam_utils
from ..typedefs cimport weight_t from ..typedefs cimport weight_t
from ..vocab cimport Vocab from ..vocab cimport Vocab
from ._parser_internals.transition_system cimport Transition, TransitionSystem from ._parser_internals.transition_system cimport Transition, TransitionSystem
from .. import util from .. import util
from ..errors import Errors, Warnings from ..errors import Errors
from ..training import ( from ..training import (
validate_distillation_examples, validate_distillation_examples,
validate_examples, validate_examples,
validate_get_examples, validate_get_examples,
) )
from ._parser_internals import _beam_utils
# TODO: Remove when we switch to Cython 3.
cdef extern from "<algorithm>" namespace "std" nogil:
bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
NUMPY_OPS = NumpyOps() NUMPY_OPS = NumpyOps()
@ -384,7 +361,6 @@ class Parser(TrainablePipe):
except Exception as e: except Exception as e:
error_handler(self.name, self, batch_in_order, e) error_handler(self.name, self, batch_in_order, e)
def predict(self, docs): def predict(self, docs):
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
@ -414,7 +390,6 @@ class Parser(TrainablePipe):
def set_annotations(self, docs, states_or_beams): def set_annotations(self, docs, states_or_beams):
cdef StateClass state cdef StateClass state
cdef Beam beam
cdef Doc doc cdef Doc doc
states = _beam_utils.collect_states(states_or_beams, docs) states = _beam_utils.collect_states(states_or_beams, docs)
for i, (state, doc) in enumerate(zip(states, docs)): for i, (state, doc) in enumerate(zip(states, docs)):
@ -423,7 +398,6 @@ class Parser(TrainablePipe):
hook(doc) hook(doc)
def update(self, examples, *, drop=0., sgd=None, losses=None): def update(self, examples, *, drop=0., sgd=None, losses=None):
cdef StateClass state
if losses is None: if losses is None:
losses = {} losses = {}
losses.setdefault(self.name, 0.) losses.setdefault(self.name, 0.)

View File

@ -4,7 +4,6 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Uni
cimport cython cimport cython
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.string cimport memcpy from libc.string cimport memcpy
from libcpp.set cimport set
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
import srsly import srsly

View File

@ -52,7 +52,8 @@ TEST_PATTERNS = [
@pytest.mark.parametrize( @pytest.mark.parametrize(
"pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]] "pattern",
[[{"XX": "y"}], [{"LENGTH": "2"}], [{"TEXT": {"IN": 5}}], [{"text": {"in": 6}}]],
) )
def test_matcher_pattern_validation(en_vocab, pattern): def test_matcher_pattern_validation(en_vocab, pattern):
matcher = Matcher(en_vocab, validate=True) matcher = Matcher(en_vocab, validate=True)

View File

@ -11,6 +11,7 @@ def test_build_dependencies():
"flake8", "flake8",
"hypothesis", "hypothesis",
"pre-commit", "pre-commit",
"cython-lint",
"black", "black",
"isort", "isort",
"mypy", "mypy",

View File

@ -230,10 +230,10 @@ def test_overfitting_IO():
# Test scoring # Test scoring
scores = nlp.evaluate(train_examples) scores = nlp.evaluate(train_examples)
assert f"span_finder_{SPANS_KEY}_f" in scores assert f"spans_{SPANS_KEY}_f" in scores
# It's not perfect 1.0 F1 because it's designed to overgenerate for now. # It's not perfect 1.0 F1 because it's designed to overgenerate for now.
assert scores[f"span_finder_{SPANS_KEY}_p"] == 0.75 assert scores[f"spans_{SPANS_KEY}_p"] == 0.75
assert scores[f"span_finder_{SPANS_KEY}_r"] == 1.0 assert scores[f"spans_{SPANS_KEY}_r"] == 1.0
# also test that the spancat works for just a single entity in a sentence # also test that the spancat works for just a single entity in a sentence
doc = nlp("London") doc = nlp("London")

View File

@ -192,8 +192,7 @@ def test_tok2vec_listener(with_vectors):
for tag in t[1]["tags"]: for tag in t[1]["tags"]:
tagger.add_label(tag) tagger.add_label(tag)
# Check that the Tok2Vec component finds it listeners # Check that the Tok2Vec component finds its listeners
assert tok2vec.listeners == []
optimizer = nlp.initialize(lambda: train_examples) optimizer = nlp.initialize(lambda: train_examples)
assert tok2vec.listeners == [tagger_tok2vec] assert tok2vec.listeners == [tagger_tok2vec]
@ -221,7 +220,6 @@ def test_tok2vec_listener_callback():
assert nlp.pipe_names == ["tok2vec", "tagger"] assert nlp.pipe_names == ["tok2vec", "tagger"]
tagger = nlp.get_pipe("tagger") tagger = nlp.get_pipe("tagger")
tok2vec = nlp.get_pipe("tok2vec") tok2vec = nlp.get_pipe("tok2vec")
nlp._link_components()
docs = [nlp.make_doc("A random sentence")] docs = [nlp.make_doc("A random sentence")]
tok2vec.model.initialize(X=docs) tok2vec.model.initialize(X=docs)
gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs] gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
@ -430,29 +428,46 @@ def test_replace_listeners_from_config():
nlp.to_disk(dir_path) nlp.to_disk(dir_path)
base_model = str(dir_path) base_model = str(dir_path)
new_config = { new_config = {
"nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, "nlp": {
"lang": "en",
"pipeline": ["tok2vec", "tagger2", "ner3", "tagger4"],
},
"components": { "components": {
"tok2vec": {"source": base_model}, "tok2vec": {"source": base_model},
"tagger": { "tagger2": {
"source": base_model, "source": base_model,
"component": "tagger",
"replace_listeners": ["model.tok2vec"], "replace_listeners": ["model.tok2vec"],
}, },
"ner": {"source": base_model}, "ner3": {
"source": base_model,
"component": "ner",
},
"tagger4": {
"source": base_model,
"component": "tagger",
},
}, },
} }
new_nlp = util.load_model_from_config(new_config, auto_fill=True) new_nlp = util.load_model_from_config(new_config, auto_fill=True)
new_nlp.initialize(lambda: examples) new_nlp.initialize(lambda: examples)
tok2vec = new_nlp.get_pipe("tok2vec") tok2vec = new_nlp.get_pipe("tok2vec")
tagger = new_nlp.get_pipe("tagger") tagger = new_nlp.get_pipe("tagger2")
ner = new_nlp.get_pipe("ner") ner = new_nlp.get_pipe("ner3")
assert tok2vec.listening_components == ["ner"] assert "ner" not in new_nlp.pipe_names
assert "tagger" not in new_nlp.pipe_names
assert tok2vec.listening_components == ["ner3", "tagger4"]
assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk()) assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk()) assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"] t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2" assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg assert new_nlp.config["components"]["tagger2"]["model"]["tok2vec"] == t2v_cfg
assert ( assert (
new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"] new_nlp.config["components"]["ner3"]["model"]["tok2vec"]["@architectures"]
== "spacy.Tok2VecListener.v1"
)
assert (
new_nlp.config["components"]["tagger4"]["model"]["tok2vec"]["@architectures"]
== "spacy.Tok2VecListener.v1" == "spacy.Tok2VecListener.v1"
) )
@ -627,3 +642,57 @@ def test_tok2vec_distillation_teacher_annotations():
student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec) student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={}) student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
def test_tok2vec_listener_source_link_name():
"""The component's internal name and the tok2vec listener map correspond
to the most recently modified pipeline.
"""
orig_config = Config().from_str(cfg_string_multi)
nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
nlp2 = English()
nlp2.add_pipe("tok2vec", source=nlp1)
nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
# there is no way to have the component have the right name for both
# pipelines; right now the most recently modified pipeline is prioritized
assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
# there is no way to have the tok2vec have the right listener map for both
# pipelines; right now the most recently modified pipeline is prioritized
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
nlp2.add_pipe("ner", name="ner3", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
nlp2.remove_pipe("ner3")
assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
nlp2.remove_pipe("tagger2")
assert nlp2.get_pipe("tok2vec").listening_components == []
# at this point the tok2vec component corresponds to nlp2
assert nlp1.get_pipe("tok2vec").listening_components == []
# modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
nlp1.add_pipe("sentencizer")
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
# modifying nlp2 syncs it back to nlp2
nlp2.add_pipe("sentencizer")
assert nlp1.get_pipe("tok2vec").listening_components == []
def test_tok2vec_listener_source_replace_listeners():
orig_config = Config().from_str(cfg_string_multi)
nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
nlp2 = English()
nlp2.add_pipe("tok2vec", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == []
nlp2.add_pipe("tagger", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == []
nlp2.add_pipe("ner", name="ner2", source=nlp1)
assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
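
A minimal sketch of the sourcing behaviour exercised in the tests above; "./base_model" is a placeholder for a trained pipeline that provides tok2vec, tagger and ner components:

import spacy
from spacy.lang.en import English

base = spacy.load("./base_model")
nlp = English()
nlp.add_pipe("tok2vec", source=base)
nlp.add_pipe("tagger", name="tagger2", source=base)
nlp.add_pipe("ner", name="ner3", source=base)
# The sourced tok2vec now listens to the components under their new names:
# nlp.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]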

View File

@ -469,6 +469,55 @@ def test_config_overrides():
assert nlp.pipe_names == ["tok2vec", "tagger"] assert nlp.pipe_names == ["tok2vec", "tagger"]
@pytest.mark.filterwarnings("ignore:\\[W036")
def test_config_overrides_registered_functions():
nlp = spacy.blank("en")
nlp.add_pipe("attribute_ruler")
with make_tempdir() as d:
nlp.to_disk(d)
nlp_re1 = spacy.load(
d,
config={
"components": {
"attribute_ruler": {
"scorer": {"@scorers": "spacy.tagger_scorer.v1"}
}
}
},
)
assert (
nlp_re1.config["components"]["attribute_ruler"]["scorer"]["@scorers"]
== "spacy.tagger_scorer.v1"
)
@registry.misc("test_some_other_key")
def misc_some_other_key():
return "some_other_key"
nlp_re2 = spacy.load(
d,
config={
"components": {
"attribute_ruler": {
"scorer": {
"@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
"spans_key": {"@misc": "test_some_other_key"},
}
}
}
},
)
assert nlp_re2.config["components"]["attribute_ruler"]["scorer"][
"spans_key"
] == {"@misc": "test_some_other_key"}
# run dummy evaluation (will return None scores) in order to test that
# the spans_key value in the nested override is working as intended in
# the config
example = Example.from_dict(nlp_re2.make_doc("a b c"), {})
scores = nlp_re2.evaluate([example])
assert "spans_some_other_key_f" in scores
def test_config_interpolation(): def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False) config = Config().from_str(nlp_config_string, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}" assert config["corpora"]["train"]["path"] == "${paths.train}"

View File

@ -697,7 +697,6 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3] assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip before models are published")
def test_download_compatibility(): def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__) spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False spec.prereleases = False
@ -708,7 +707,6 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version) assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip before models are published")
def test_validate_compatibility_table(): def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__) spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False spec.prereleases = False

View File

@ -377,3 +377,22 @@ def test_displacy_manual_sorted_entities():
html = displacy.render(doc, style="ent", manual=True) html = displacy.render(doc, style="ent", manual=True)
assert html.find("FIRST") < html.find("SECOND") assert html.find("FIRST") < html.find("SECOND")
@pytest.mark.issue(12816)
def test_issue12816(en_vocab) -> None:
"""Test that displaCy's span visualizer escapes annotated HTML tags correctly."""
# Create a doc containing an annotated word and an unannotated HTML tag
doc = Doc(en_vocab, words=["test", "<TEST>"])
doc.spans["sc"] = [Span(doc, 0, 1, label="test")]
# Verify that the HTML tag is escaped when unannotated
html = displacy.render(doc, style="span")
assert "&lt;TEST&gt;" in html
# Annotate the HTML tag
doc.spans["sc"].append(Span(doc, 1, 2, label="test"))
# Verify that the HTML tag is still escaped
html = displacy.render(doc, style="span")
assert "&lt;TEST&gt;" in html

View File

@ -220,6 +220,10 @@ def test_minor_version(a1, a2, b1, b2, is_match):
{"training.batch_size": 128, "training.optimizer.learn_rate": 0.01}, {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
{"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}}, {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
), ),
(
{"attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"},
{"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
),
], ],
) )
def test_dot_to_dict(dot_notation, expected): def test_dot_to_dict(dot_notation, expected):
@ -228,6 +232,29 @@ def test_dot_to_dict(dot_notation, expected):
assert util.dict_to_dot(result) == dot_notation assert util.dict_to_dot(result) == dot_notation
@pytest.mark.parametrize(
"dot_notation,expected",
[
(
{"token.pos": True, "token._.xyz": True},
{"token": {"pos": True, "_": {"xyz": True}}},
),
(
{"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
{"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
),
(
{"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
{"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
),
],
)
def test_dot_to_dict_overrides(dot_notation, expected):
result = util.dot_to_dict(dot_notation)
assert result == expected
assert util.dict_to_dot(result, for_overrides=True) == dot_notation
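
A compact sketch of the round trip checked by the parametrized test above:

from spacy import util

dot = {"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}}
nested = util.dot_to_dict(dot)
assert nested == {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}}
assert util.dict_to_dot(nested, for_overrides=True) == dot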
def test_set_dot_to_object(): def test_set_dot_to_object():
config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}} config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}}
with pytest.raises(KeyError): with pytest.raises(KeyError):

View File

@ -401,6 +401,7 @@ def test_vectors_serialize():
row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f")) row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
assert row == row_r assert row == row_r
assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data)) assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
assert v.attr == v_r.attr
def test_vector_is_oov(): def test_vector_is_oov():
@ -645,3 +646,32 @@ def test_equality():
vectors1.resize((5, 9)) vectors1.resize((5, 9))
vectors2.resize((5, 9)) vectors2.resize((5, 9))
assert vectors1 == vectors2 assert vectors1 == vectors2
def test_vectors_attr():
data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
# default ORTH
nlp = English()
nlp.vocab.vectors = Vectors(data=data, keys=["A", "B", "C"])
assert nlp.vocab.strings["A"] in nlp.vocab.vectors.key2row
assert nlp.vocab.strings["a"] not in nlp.vocab.vectors.key2row
assert nlp.vocab["A"].has_vector is True
assert nlp.vocab["a"].has_vector is False
assert nlp("A")[0].has_vector is True
assert nlp("a")[0].has_vector is False
# custom LOWER
nlp = English()
nlp.vocab.vectors = Vectors(data=data, keys=["a", "b", "c"], attr="LOWER")
assert nlp.vocab.strings["A"] not in nlp.vocab.vectors.key2row
assert nlp.vocab.strings["a"] in nlp.vocab.vectors.key2row
assert nlp.vocab["A"].has_vector is True
assert nlp.vocab["a"].has_vector is True
assert nlp("A")[0].has_vector is True
assert nlp("a")[0].has_vector is True
# add a new vectors entry
assert nlp.vocab["D"].has_vector is False
assert nlp.vocab["d"].has_vector is False
nlp.vocab.set_vector("D", numpy.asarray([4, 5, 6]))
assert nlp.vocab["D"].has_vector is True
assert nlp.vocab["d"].has_vector is True

View File

@ -26,24 +26,57 @@ cdef class Tokenizer:
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases) cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1 cdef int _apply_special_cases(self, Doc doc) except -1
cdef void _filter_special_spans(self, vector[SpanC] &original, cdef void _filter_special_spans(
vector[SpanC] &filtered, int doc_len) nogil self,
cdef object _prepare_special_spans(self, Doc doc, vector[SpanC] &original,
vector[SpanC] &filtered) vector[SpanC] &filtered,
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, int doc_len,
object span_data) ) nogil
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, cdef object _prepare_special_spans(
self,
Doc doc,
vector[SpanC] &filtered,
)
cdef int _retokenize_special_spans(
self,
Doc doc,
TokenC* tokens,
object span_data,
)
cdef int _try_specials_and_cache(
self,
hash_t key,
Doc tokens,
int* has_special, int* has_special,
bint with_special_cases) except -1 bint with_special_cases,
cdef int _tokenize(self, Doc tokens, str span, hash_t key, ) except -1
int* has_special, bint with_special_cases) except -1 cdef int _tokenize(
cdef str _split_affixes(self, str string, self,
Doc tokens,
str span,
hash_t key,
int* has_special,
bint with_special_cases,
) except -1
cdef str _split_affixes(
self,
str string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special, vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases) bint with_special_cases,
cdef int _attach_tokens(self, Doc tokens, str string, )
cdef int _attach_tokens(
self,
Doc tokens,
str string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special, vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases) except -1 bint with_special_cases,
cdef int _save_cached(self, const TokenC* tokens, hash_t key, ) except -1
int* has_special, int n) except -1 cdef int _save_cached(
self,
const TokenC* tokens,
hash_t key,
int* has_special,
int n,
) except -1

View File

@ -323,7 +323,7 @@ cdef class Tokenizer:
cdef int span_start cdef int span_start
cdef int span_end cdef int span_end
while i < doc.length: while i < doc.length:
if not i in span_data: if i not in span_data:
tokens[i + offset] = doc.c[i] tokens[i + offset] = doc.c[i]
i += 1 i += 1
else: else:
@ -394,12 +394,14 @@ cdef class Tokenizer:
self._save_cached(&tokens.c[orig_size], orig_key, has_special, self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size) tokens.length - orig_size)
cdef str _split_affixes(self, str string, cdef str _split_affixes(
self,
str string,
vector[const LexemeC*] *prefixes, vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes, vector[const LexemeC*] *suffixes,
int* has_special, int* has_special,
bint with_special_cases): bint with_special_cases
cdef size_t i ):
cdef str prefix cdef str prefix
cdef str suffix cdef str suffix
cdef str minus_pre cdef str minus_pre
@ -444,10 +446,6 @@ cdef class Tokenizer:
vector[const LexemeC*] *suffixes, vector[const LexemeC*] *suffixes,
int* has_special, int* has_special,
bint with_special_cases) except -1: bint with_special_cases) except -1:
cdef bint specials_hit = 0
cdef bint cache_hit = 0
cdef int split, end
cdef const LexemeC* const* lexemes
cdef const LexemeC* lexeme cdef const LexemeC* lexeme
cdef str span cdef str span
cdef int i cdef int i
@ -457,9 +455,11 @@ cdef class Tokenizer:
if string: if string:
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases): if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
pass pass
elif (self.token_match and self.token_match(string)) or \ elif (
(self.url_match and \ (self.token_match and self.token_match(string)) or
self.url_match(string)): (self.url_match and self.url_match(string))
):
# We're always saying 'no' to spaces here -- the caller will # We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original. # fix up the outermost one, with reference to the original.
# See Issue #859 # See Issue #859
@ -820,7 +820,7 @@ cdef class Tokenizer:
self.infix_finditer = None self.infix_finditer = None
self.token_match = None self.token_match = None
self.url_match = None self.url_match = None
msg = util.from_bytes(bytes_data, deserializers, exclude) util.from_bytes(bytes_data, deserializers, exclude)
if "prefix_search" in data and isinstance(data["prefix_search"], str): if "prefix_search" in data and isinstance(data["prefix_search"], str):
self.prefix_search = re.compile(data["prefix_search"]).search self.prefix_search = re.compile(data["prefix_search"]).search
if "suffix_search" in data and isinstance(data["suffix_search"], str): if "suffix_search" in data and isinstance(data["suffix_search"], str):

View File

@ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef int [:, :] _get_lca_matrix(Doc, int start, int end)
cdef class Doc: cdef class Doc:
@ -61,7 +61,6 @@ cdef class Doc:
cdef int length cdef int length
cdef int max_length cdef int max_length
cdef public object noun_chunks_iterator cdef public object noun_chunks_iterator
cdef object __weakref__ cdef object __weakref__

View File

@ -35,6 +35,7 @@ from ..attrs cimport (
LENGTH, LENGTH,
MORPH, MORPH,
NORM, NORM,
ORTH,
POS, POS,
SENT_START, SENT_START,
SPACY, SPACY,
@ -42,14 +43,13 @@ from ..attrs cimport (
attr_id_t, attr_id_t,
) )
from ..lexeme cimport EMPTY_LEXEME, Lexeme from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t
from .token cimport Token from .token cimport Token
from .. import parts_of_speech, schemas, util from .. import parts_of_speech, schemas, util
from ..attrs import IDS, intify_attr from ..attrs import IDS, intify_attr
from ..compat import copy_reg, pickle from ..compat import copy_reg
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..morphology import Morphology
from ..util import get_words_and_spaces from ..util import get_words_and_spaces
from .retokenizer import Retokenizer from .retokenizer import Retokenizer
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
@ -613,13 +613,26 @@ cdef class Doc:
""" """
if "similarity" in self.user_hooks: if "similarity" in self.user_hooks:
return self.user_hooks["similarity"](self, other) return self.user_hooks["similarity"](self, other)
if isinstance(other, (Lexeme, Token)) and self.length == 1: attr = getattr(self.vocab.vectors, "attr", ORTH)
if self.c[0].lex.orth == other.orth: cdef Token this_token
cdef Token other_token
cdef Lexeme other_lex
if len(self) == 1 and isinstance(other, Token):
this_token = self[0]
other_token = other
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
return 1.0 return 1.0
elif isinstance(other, (Span, Doc)) and len(self) == len(other): elif len(self) == 1 and isinstance(other, Lexeme):
this_token = self[0]
other_lex = other
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
return 1.0
elif isinstance(other, (Doc, Span)) and len(self) == len(other):
similar = True similar = True
for i in range(self.length): for i in range(len(self)):
if self[i].orth != other[i].orth: this_token = self[i]
other_token = other[i]
if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
similar = False similar = False
break break
if similar: if similar:
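Note for reviewers: the similarity shortcuts above now compare tokens on the attribute configured on the vectors table instead of always using `ORTH`. A minimal sketch of the resulting behaviour, assuming a small table keyed on `LOWER` (the vector values and keys below are made up for illustration):

```python
import numpy
import spacy
from spacy.vectors import Vectors

nlp = spacy.blank("en")
# Hypothetical 2-dimensional vectors keyed on the LOWER attribute.
data = numpy.asarray([[1.0, 0.0], [0.0, 1.0]], dtype="f")
nlp.vocab.vectors = Vectors(
    strings=nlp.vocab.strings, data=data, keys=["dog", "cat"], attr="LOWER"
)

doc1 = nlp("Dog")
doc2 = nlp("dog")
# Both tokens share the same LOWER value, so the exact-match branch returns 1.0
# before any vector lookup happens.
print(doc1.similarity(doc2))
```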
@ -767,7 +780,7 @@ cdef class Doc:
# TODO: # TODO:
# 1. Test basic data-driven ORTH gazetteer # 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex # 2. Test more nuanced date and currency regex
cdef attr_t entity_type, kb_id, ent_id cdef attr_t kb_id, ent_id
cdef int ent_start, ent_end cdef int ent_start, ent_end
ent_spans = [] ent_spans = []
for ent_info in ents: for ent_info in ents:
@ -975,7 +988,6 @@ cdef class Doc:
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
""" """
cdef int i, j cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output cdef np.ndarray[attr_t, ndim=2] output
# Handle scalar/list inputs of strings/ints for py_attr_ids # Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064 # See also #3064
@ -987,8 +999,10 @@ cdef class Doc:
py_attr_ids = [py_attr_ids] py_attr_ids = [py_attr_ids]
# Allow strings, e.g. 'lemma' or 'LEMMA' # Allow strings, e.g. 'lemma' or 'LEMMA'
try: try:
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) py_attr_ids = [
for id_ in py_attr_ids] (IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in py_attr_ids
]
except KeyError as msg: except KeyError as msg:
keys = list(IDS.keys()) keys = list(IDS.keys())
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
@ -1022,8 +1036,6 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#count_by DOCS: https://spacy.io/api/doc#count_by
""" """
cdef int i cdef int i
cdef attr_t attr
cdef size_t count
if counts is None: if counts is None:
counts = Counter() counts = Counter()
@ -1085,7 +1097,6 @@ cdef class Doc:
cdef int i, col cdef int i, col
cdef int32_t abs_head_index cdef int32_t abs_head_index
cdef attr_id_t attr_id cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
cdef int length = len(array) cdef int length = len(array)
if length != len(self): if length != len(self):
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self))) raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
@ -1505,7 +1516,6 @@ cdef class Doc:
attributes are inherited from the syntactic root of the span. attributes are inherited from the syntactic root of the span.
RETURNS (Token): The first newly merged token. RETURNS (Token): The first newly merged token.
""" """
cdef str tag, lemma, ent_type
attr_len = len(attributes) attr_len = len(attributes)
span_len = len(spans) span_len = len(spans)
if not attr_len == span_len: if not attr_len == span_len:
@ -1621,7 +1631,6 @@ cdef class Doc:
for token in char_span[1:]: for token in char_span[1:]:
token.is_sent_start = False token.is_sent_start = False
for span_group in doc_json.get("spans", {}): for span_group in doc_json.get("spans", {}):
spans = [] spans = []
for span in doc_json["spans"][span_group]: for span in doc_json["spans"][span_group]:
@ -1773,7 +1782,6 @@ cdef class Doc:
output.fill(255) output.fill(255)
cdef int i, j, start_idx, end_idx cdef int i, j, start_idx, end_idx
cdef bytes byte_string cdef bytes byte_string
cdef unsigned char utf8_char
for i, byte_string in enumerate(byte_strings): for i, byte_string in enumerate(byte_strings):
j = 0 j = 0
start_idx = 0 start_idx = 0
@ -1826,8 +1834,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
# note: end is exclusive # note: end is exclusive
cdef TokenC* head
cdef TokenC* child
cdef int i cdef int i
# Set number of left/right children to 0. We'll increment it in the loops. # Set number of left/right children to 0. We'll increment it in the loops.
for i in range(start, end): for i in range(start, end):
@ -1927,7 +1933,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k):
return -1 return -1
cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end):
"""Given a doc and a start and end position defining a set of contiguous """Given a doc and a start and end position defining a set of contiguous
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
LCA[i, j] is the index of the lowest common ancestor among token i and j. LCA[i, j] is the index of the lowest common ancestor among token i and j.
@ -1940,7 +1946,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32], RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
with shape (n, n), where n = len(doc). with shape (n, n), where n = len(doc).
""" """
cdef int [:,:] lca_matrix cdef int [:, :] lca_matrix
cdef int j, k cdef int j, k
n_tokens= end - start n_tokens= end - start
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32) lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)

View File

@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
cimport cython cimport cython
from cython.operator cimport dereference from cython.operator cimport dereference
from libc.stdint cimport int32_t, int64_t from libc.stdint cimport int32_t
from libcpp.pair cimport pair from libcpp.pair cimport pair
from libcpp.unordered_map cimport unordered_map from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set from libcpp.unordered_set cimport unordered_set
@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
import weakref import weakref
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from preshed.maps cimport map_get_unless_missing
from .. import Errors from .. import Errors
@ -370,7 +369,9 @@ cdef class Graph:
>>> assert graph.has_node((0,)) >>> assert graph.has_node((0,))
>>> assert graph.has_edge((0,), (1,3), label="agent") >>> assert graph.has_edge((0,), (1,3), label="agent")
""" """
def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None): def __init__(
self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None # no-cython-lint
):
"""Create a Graph object. """Create a Graph object.
doc (Doc): The Doc object the graph will refer to. doc (Doc): The Doc object the graph will refer to.
@ -441,8 +442,6 @@ cdef class Graph:
be returned, and no new edge will be created. The weight of the edge be returned, and no new edge will be created. The weight of the edge
will be updated if a weight is specified. will be updated if a weight is specified.
""" """
label_hash = self.doc.vocab.strings.as_int(label)
weight_float = weight if weight is not None else 0.0
edge_index = add_edge( edge_index = add_edge(
&self.c, &self.c,
EdgeC( EdgeC(

View File

@ -94,4 +94,3 @@ cdef class MorphAnalysis:
def __repr__(self): def __repr__(self):
return self.to_json() return self.to_json()

View File

@ -1,7 +1,6 @@
# cython: infer_types=True, bounds_check=False, profile=True # cython: infer_types=True, bounds_check=False, profile=True
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from libc.stdlib cimport free, malloc from libc.string cimport memset
from libc.string cimport memcpy, memset
import numpy import numpy
from thinc.api import get_array_module from thinc.api import get_array_module
@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
from ..lexeme cimport EMPTY_LEXEME, Lexeme from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..structs cimport LexemeC, TokenC from ..structs cimport LexemeC, TokenC
from ..vocab cimport Vocab from ..vocab cimport Vocab
from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start from .doc cimport Doc, set_children_from_heads, token_by_start
from .span cimport Span from .span cimport Span
from .token cimport Token from .token cimport Token
@ -148,7 +147,7 @@ def _merge(Doc doc, merges):
syntactic root of the span. syntactic root of the span.
RETURNS (Token): The first newly merged token. RETURNS (Token): The first newly merged token.
""" """
cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
cdef Span span cdef Span span
cdef const LexemeC* lex cdef const LexemeC* lex
cdef TokenC* token cdef TokenC* token
@ -166,7 +165,6 @@ def _merge(Doc doc, merges):
merges.sort(key=_get_start) merges.sort(key=_get_start)
for merge_index, (span, attributes) in enumerate(merges): for merge_index, (span, attributes) in enumerate(merges):
start = span.start start = span.start
end = span.end
spans.append(span) spans.append(span)
# House the new merged token where it starts # House the new merged token where it starts
token = &doc.c[start] token = &doc.c[start]
@ -204,8 +202,9 @@ def _merge(Doc doc, merges):
# for the merged region. To do this, we create a boolean array indicating # for the merged region. To do this, we create a boolean array indicating
# whether the row is to be deleted, then use numpy.delete # whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0: if doc.tensor is not None and doc.tensor.size != 0:
doc.tensor = _resize_tensor(doc.tensor, doc.tensor = _resize_tensor(
[(m[0].start, m[0].end) for m in merges]) doc.tensor, [(m[0].start, m[0].end) for m in merges]
)
# Memorize span roots and sets dependencies of the newly merged # Memorize span roots and sets dependencies of the newly merged
# tokens to the dependencies of their roots. # tokens to the dependencies of their roots.
span_roots = [] span_roots = []
@ -346,7 +345,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
if to_process_tensor: if to_process_tensor:
xp = get_array_module(doc.tensor) xp = get_array_module(doc.tensor)
if xp is numpy: if xp is numpy:
doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0) doc.tensor = xp.append(
doc.tensor,
xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
axis=0
)
else: else:
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1]) shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
resized_array = xp.zeros(shape, dtype="float32") resized_array = xp.zeros(shape, dtype="float32")
@ -368,7 +371,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
token.norm = 0 # reset norm token.norm = 0 # reset norm
if to_process_tensor: if to_process_tensor:
# setting the tensors of the split tokens to array of zeros # setting the tensors of the split tokens to array of zeros
doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32") doc.tensor[token_index + i:token_index + i + 1] = \
xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
# Update the character offset of the subtokens # Update the character offset of the subtokens
if i != 0: if i != 0:
token.idx = orig_token.idx + idx_offset token.idx = orig_token.idx + idx_offset
@ -456,7 +460,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
def set_token_attrs(Token py_token, attrs): def set_token_attrs(Token py_token, attrs):
cdef TokenC* token = py_token.c cdef TokenC* token = py_token.c
cdef const LexemeC* lex = token.lex cdef const LexemeC* lex = token.lex
cdef Doc doc = py_token.doc
# Assign attributes # Assign attributes
for attr_name, attr_value in attrs.items(): for attr_name, attr_value in attrs.items():
if attr_name == "_": # Set extension attributes if attr_name == "_": # Set extension attributes

View File

@ -1,5 +1,4 @@
cimport numpy as np cimport numpy as np
from libc.math cimport sqrt
from libcpp.memory cimport make_shared from libcpp.memory cimport make_shared
import copy import copy
@ -9,13 +8,13 @@ import numpy
from thinc.api import get_array_module from thinc.api import get_array_module
from ..attrs cimport * from ..attrs cimport *
from ..attrs cimport attr_id_t from ..attrs cimport ORTH, attr_id_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..parts_of_speech cimport univ_pos_t from ..structs cimport TokenC
from ..structs cimport LexemeC, TokenC
from ..symbols cimport dep from ..symbols cimport dep
from ..typedefs cimport attr_t, flags_t, hash_t from ..typedefs cimport attr_t
from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start from .doc cimport _get_lca_matrix, get_token_attr
from .token cimport Token
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import normalize_slice from ..util import normalize_slice
@ -371,13 +370,26 @@ cdef class Span:
""" """
if "similarity" in self.doc.user_span_hooks: if "similarity" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["similarity"](self, other) return self.doc.user_span_hooks["similarity"](self, other)
if len(self) == 1 and hasattr(other, "orth"): attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
if self[0].orth == other.orth: cdef Token this_token
cdef Token other_token
cdef Lexeme other_lex
if len(self) == 1 and isinstance(other, Token):
this_token = self[0]
other_token = other
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
return 1.0
elif len(self) == 1 and isinstance(other, Lexeme):
this_token = self[0]
other_lex = other
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
return 1.0 return 1.0
elif isinstance(other, (Doc, Span)) and len(self) == len(other): elif isinstance(other, (Doc, Span)) and len(self) == len(other):
similar = True similar = True
for i in range(len(self)): for i in range(len(self)):
if self[i].orth != getattr(other[i], "orth", None): this_token = self[i]
other_token = other[i]
if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
similar = False similar = False
break break
if similar: if similar:
@ -607,7 +619,6 @@ cdef class Span:
""" """
return "".join([t.text_with_ws for t in self]) return "".join([t.text_with_ws for t in self])
@property @property
def noun_chunks(self): def noun_chunks(self):
"""Iterate over the base noun phrases in the span. Yields base """Iterate over the base noun phrases in the span. Yields base

View File

@ -1,7 +1,7 @@
import struct import struct
import weakref import weakref
from copy import deepcopy from copy import deepcopy
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union from typing import Iterable, Optional, Union
import srsly import srsly
@ -36,7 +36,7 @@ cdef class SpanGroup:
DOCS: https://spacy.io/api/spangroup DOCS: https://spacy.io/api/spangroup
""" """
def __init__(self, doc, *, name="", attrs={}, spans=[]): def __init__(self, doc, *, name="", attrs={}, spans=[]): # no-cython-lint
"""Create a SpanGroup. """Create a SpanGroup.
doc (Doc): The reference Doc object. doc (Doc): The reference Doc object.
@ -315,7 +315,7 @@ cdef class SpanGroup:
other_attrs = deepcopy(other_group.attrs) other_attrs = deepcopy(other_group.attrs)
span_group.attrs.update({ span_group.attrs.update({
key: value for key, value in other_attrs.items() \ key: value for key, value in other_attrs.items()
if key not in span_group.attrs if key not in span_group.attrs
}) })
if len(other_group): if len(other_group):

View File

@ -26,7 +26,7 @@ cdef class Token:
cdef Token self = Token.__new__(Token, vocab, doc, offset) cdef Token self = Token.__new__(Token, vocab, doc, offset)
return self return self
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs): # cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
# cdef TokenC token # cdef TokenC token
# attrs = normalize_attrs(attrs) # attrs = normalize_attrs(attrs)
@ -98,12 +98,10 @@ cdef class Token:
elif feat_name == SENT_START: elif feat_name == SENT_START:
token.sent_start = value token.sent_start = value
@staticmethod @staticmethod
cdef inline int missing_dep(const TokenC* token) nogil: cdef inline int missing_dep(const TokenC* token) nogil:
return token.dep == MISSING_DEP return token.dep == MISSING_DEP
@staticmethod @staticmethod
cdef inline int missing_head(const TokenC* token) nogil: cdef inline int missing_head(const TokenC* token) nogil:
return Token.missing_dep(token) return Token.missing_dep(token)

View File

@ -1,13 +1,11 @@
# cython: infer_types=True # cython: infer_types=True
# Compiler crashes on memory view coercion without this. Should report bug. # Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np cimport numpy as np
from cython.view cimport array as cvarray
np.import_array() np.import_array()
import warnings import warnings
import numpy
from thinc.api import get_array_module from thinc.api import get_array_module
from ..attrs cimport ( from ..attrs cimport (
@ -28,6 +26,7 @@ from ..attrs cimport (
LIKE_EMAIL, LIKE_EMAIL,
LIKE_NUM, LIKE_NUM,
LIKE_URL, LIKE_URL,
ORTH,
) )
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..symbols cimport conj from ..symbols cimport conj
@ -216,11 +215,17 @@ cdef class Token:
""" """
if "similarity" in self.doc.user_token_hooks: if "similarity" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["similarity"](self, other) return self.doc.user_token_hooks["similarity"](self, other)
if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"): attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
if self.c.lex.orth == getattr(other[0], "orth", None): cdef Token this_token = self
cdef Token other_token
cdef Lexeme other_lex
if isinstance(other, Token):
other_token = other
if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
return 1.0 return 1.0
elif hasattr(other, "orth"): elif isinstance(other, Lexeme):
if self.c.lex.orth == other.orth: other_lex = other
if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
return 1.0 return 1.0
if self.vocab.vectors.n_keys == 0: if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Token")) warnings.warn(Warnings.W007.format(obj="Token"))
@ -421,7 +426,7 @@ cdef class Token:
if "vector" in self.doc.user_token_hooks: if "vector" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["vector"](self) return self.doc.user_token_hooks["vector"](self)
else: else:
return self.vocab.get_vector(self.c.lex.orth) return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
@property @property
def vector_norm(self): def vector_norm(self):
@ -528,9 +533,9 @@ cdef class Token:
def __get__(self): def __get__(self):
if self.i + 1 == len(self.doc): if self.i + 1 == len(self.doc):
return True return True
elif self.doc[self.i+1].is_sent_start == None: elif self.doc[self.i+1].is_sent_start is None:
return None return None
elif self.doc[self.i+1].is_sent_start == True: elif self.doc[self.i+1].is_sent_start is True:
return True return True
else: else:
return False return False

View File

@ -37,10 +37,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
b2a.append(set()) b2a.append(set())
# Process the alignment at the current position # Process the alignment at the current position
if A[token_idx_a] == B[token_idx_b] and \ if A[token_idx_a] == B[token_idx_b] and \
(char_idx_a == 0 or \ (
char_to_token_a[char_idx_a - 1] < token_idx_a) and \ char_idx_a == 0 or
(char_idx_b == 0 or \ char_to_token_a[char_idx_a - 1] < token_idx_a
char_to_token_b[char_idx_b - 1] < token_idx_b): ) and \
(
char_idx_b == 0 or
char_to_token_b[char_idx_b - 1] < token_idx_b
):
# Current tokens are identical and both character offsets are the # Current tokens are identical and both character offsets are the
# start of a token (either at the beginning of the document or the # start of a token (either at the beginning of the document or the
# previous character belongs to a different token) # previous character belongs to a different token)

View File

@ -1,4 +1,3 @@
import warnings
from collections.abc import Iterable as IterableInstance from collections.abc import Iterable as IterableInstance
import numpy import numpy
@ -168,7 +167,6 @@ cdef class Example:
self._y_sig = y_sig self._y_sig = y_sig
return self._cached_alignment return self._cached_alignment
def _get_aligned_vectorized(self, align, gold_values): def _get_aligned_vectorized(self, align, gold_values):
# Fast path for Doc attributes/fields that are predominantly a single value, # Fast path for Doc attributes/fields that are predominantly a single value,
# i.e., TAG, POS, MORPH. # i.e., TAG, POS, MORPH.
@ -211,7 +209,6 @@ cdef class Example:
return output.tolist() return output.tolist()
def _get_aligned_non_vectorized(self, align, gold_values): def _get_aligned_non_vectorized(self, align, gold_values):
# Slower path for fields that return multiple values (resulting # Slower path for fields that return multiple values (resulting
# in ragged arrays that cannot be vectorized trivially). # in ragged arrays that cannot be vectorized trivially).
@ -228,7 +225,6 @@ cdef class Example:
return output return output
def get_aligned(self, field, as_string=False): def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute.""" """Return an aligned array for a token attribute."""
align = self.alignment.x2y align = self.alignment.x2y
@ -337,7 +333,7 @@ cdef class Example:
missing=None missing=None
) )
# Now fill the tokens we can align to O. # Now fill the tokens we can align to O.
O = 2 # I=1, O=2, B=3 O = 2 # I=1, O=2, B=3 # no-cython-lint: E741
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")): for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
if x_tags[i] is None: if x_tags[i] is None:
if ent_iob == O: if ent_iob == O:
@ -347,7 +343,7 @@ cdef class Example:
return x_ents, x_tags return x_ents, x_tags
def get_aligned_ner(self): def get_aligned_ner(self):
x_ents, x_tags = self.get_aligned_ents_and_ner() _x_ents, x_tags = self.get_aligned_ents_and_ner()
return x_tags return x_tags
def get_matching_ents(self, check_label=True): def get_matching_ents(self, check_label=True):
@ -405,7 +401,6 @@ cdef class Example:
return span_dict return span_dict
def _links_to_dict(self): def _links_to_dict(self):
links = {} links = {}
for ent in self.reference.ents: for ent in self.reference.ents:
@ -596,6 +591,7 @@ def _fix_legacy_dict_data(example_dict):
"doc_annotation": doc_dict "doc_annotation": doc_dict
} }
def _has_field(annot, field): def _has_field(annot, field):
if field not in annot: if field not in annot:
return False return False
@ -632,6 +628,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
ent_types.append("") ent_types.append("")
return ent_iobs, ent_types return ent_iobs, ent_types
def _parse_links(vocab, words, spaces, links): def _parse_links(vocab, words, spaces, links):
reference = Doc(vocab, words=words, spaces=spaces) reference = Doc(vocab, words=words, spaces=spaces)
starts = {token.idx: token.i for token in reference} starts = {token.idx: token.i for token in reference}

View File

@ -1,4 +1,3 @@
import json
import warnings import warnings
import srsly import srsly
@ -6,7 +5,7 @@ import srsly
from .. import util from .. import util
from ..errors import Warnings from ..errors import Warnings
from ..tokens import Doc from ..tokens import Doc
from .iob_utils import offsets_to_biluo_tags, tags_to_entities from .iob_utils import offsets_to_biluo_tags
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
@ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
json_doc = {"id": doc_id, "paragraphs": []} json_doc = {"id": doc_id, "paragraphs": []}
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
raw = None if doc.has_unknown_spaces else doc.text raw = None if doc.has_unknown_spaces else doc.text
json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []} json_para = {
'raw': raw,
"sentences": [],
"cats": [],
"entities": [],
"links": []
}
for cat, val in doc.cats.items(): for cat, val in doc.cats.items():
json_cat = {"label": cat, "value": val} json_cat = {"label": cat, "value": val}
json_para["cats"].append(json_cat) json_para["cats"].append(json_cat)
@ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
if ent.kb_id_: if ent.kb_id_:
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
json_para["links"].append(link_dict) json_para["links"].append(link_dict)
biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag) biluo_tags = offsets_to_biluo_tags(
doc, json_para["entities"], missing=ner_missing_tag
)
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB") attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs} include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
for j, sent in enumerate(doc.sents): for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []} json_sent = {"tokens": [], "brackets": []}
for token in sent: for token in sent:
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_} json_token = {
"id": token.i, "orth": token.text, "space": token.whitespace_
}
if include_annotation["TAG"]: if include_annotation["TAG"]:
json_token["tag"] = token.tag_ json_token["tag"] = token.tag_
if include_annotation["POS"]: if include_annotation["POS"]:
@ -125,9 +134,14 @@ def json_to_annotations(doc):
else: else:
sent_starts.append(-1) sent_starts.append(-1)
if "brackets" in sent: if "brackets" in sent:
brackets.extend((b["first"] + sent_start_i, brackets.extend(
b["last"] + sent_start_i, b["label"]) (
for b in sent["brackets"]) b["first"] + sent_start_i,
b["last"] + sent_start_i,
b["label"]
)
for b in sent["brackets"]
)
example["token_annotation"] = dict( example["token_annotation"] = dict(
ids=ids, ids=ids,
@ -160,6 +174,7 @@ def json_to_annotations(doc):
) )
yield example yield example
def json_iterate(bytes utf8_str): def json_iterate(bytes utf8_str):
# We should've made these files jsonl...But since we didn't, parse out # We should've made these files jsonl...But since we didn't, parse out
# the docs one-by-one to reduce memory usage. # the docs one-by-one to reduce memory usage.

View File

@ -71,7 +71,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
with nlp.select_pipes(enable=resume_components): with nlp.select_pipes(enable=resume_components):
logger.info("Resuming training for: %s", resume_components) logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer) nlp.resume_training(sgd=optimizer)
# Make sure that listeners are defined before initializing further # Make sure that internal component names are synced and listeners are
# defined before initializing further
nlp._link_components() nlp._link_components()
with nlp.select_pipes(disable=[*frozen_components, *resume_components]): with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
if T["max_epochs"] == -1: if T["max_epochs"] == -1:
@ -305,9 +306,14 @@ def convert_vectors(
truncate: int, truncate: int,
prune: int, prune: int,
mode: str = VectorsMode.default, mode: str = VectorsMode.default,
attr: str = "ORTH",
) -> None: ) -> None:
vectors_loc = ensure_path(vectors_loc) vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
if attr != "ORTH":
raise ValueError(
"ORTH is the only attribute supported for vectors in .npz format."
)
nlp.vocab.vectors = Vectors( nlp.vocab.vectors = Vectors(
strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb")) strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb"))
) )
@ -335,11 +341,15 @@ def convert_vectors(
nlp.vocab.vectors = Vectors( nlp.vocab.vectors = Vectors(
strings=nlp.vocab.strings, strings=nlp.vocab.strings,
data=vectors_data, data=vectors_data,
attr=attr,
**floret_settings, **floret_settings,
) )
else: else:
nlp.vocab.vectors = Vectors( nlp.vocab.vectors = Vectors(
strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys strings=nlp.vocab.strings,
data=vectors_data,
keys=vector_keys,
attr=attr,
) )
nlp.vocab.deduplicate_vectors() nlp.vocab.deduplicate_vectors()
if prune >= 1 and mode != VectorsMode.floret: if prune >= 1 and mode != VectorsMode.floret:
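Usage note for the new `attr` argument (a sketch based on the signature in this diff, with a placeholder vectors path; `.npz` input remains `ORTH`-only as the added check shows):

```python
from pathlib import Path

import spacy
from spacy.training.initialize import convert_vectors

nlp = spacy.blank("en")
# "vectors.txt" is a placeholder path to a word2vec-style text vectors file.
convert_vectors(
    nlp,
    Path("vectors.txt"),
    truncate=0,   # keep all rows
    prune=-1,     # no pruning
    attr="LOWER", # key the resulting table on LOWER instead of ORTH
)
print(nlp.vocab.vectors.attr)
```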

View File

@ -518,7 +518,7 @@ def load_model_from_path(
if not meta: if not meta:
meta = get_model_meta(model_path) meta = get_model_meta(model_path)
config_path = model_path / "config.cfg" config_path = model_path / "config.cfg"
overrides = dict_to_dot(config) overrides = dict_to_dot(config, for_overrides=True)
config = load_config(config_path, overrides=overrides) config = load_config(config_path, overrides=overrides)
nlp = load_model_from_config( nlp = load_model_from_config(
config, config,
@ -1486,14 +1486,19 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
return result return result
def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]: def dict_to_dot(obj: Dict[str, dict], *, for_overrides: bool = False) -> Dict[str, Any]:
"""Convert dot notation to a dict. For example: {"token": {"pos": True, """Convert dot notation to a dict. For example: {"token": {"pos": True,
"_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}. "_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.
values (Dict[str, dict]): The dict to convert. obj (Dict[str, dict]): The dict to convert.
for_overrides (bool): Whether to enable special handling for registered
functions in overrides.
RETURNS (Dict[str, Any]): The key/value pairs. RETURNS (Dict[str, Any]): The key/value pairs.
""" """
return {".".join(key): value for key, value in walk_dict(obj)} return {
".".join(key): value
for key, value in walk_dict(obj, for_overrides=for_overrides)
}
def dot_to_object(config: Config, section: str): def dot_to_object(config: Config, section: str):
@ -1535,13 +1540,20 @@ def set_dot_to_object(config: Config, section: str, value: Any) -> None:
def walk_dict( def walk_dict(
node: Dict[str, Any], parent: List[str] = [] node: Dict[str, Any], parent: List[str] = [], *, for_overrides: bool = False
) -> Iterator[Tuple[List[str], Any]]: ) -> Iterator[Tuple[List[str], Any]]:
"""Walk a dict and yield the path and values of the leaves.""" """Walk a dict and yield the path and values of the leaves.
for_overrides (bool): Whether to treat registered functions that start with
@ as final values rather than dicts to traverse.
"""
for key, value in node.items(): for key, value in node.items():
key_parent = [*parent, key] key_parent = [*parent, key]
if isinstance(value, dict): if isinstance(value, dict) and (
yield from walk_dict(value, key_parent) not for_overrides
or not any(value_key.startswith("@") for value_key in value)
):
yield from walk_dict(value, key_parent, for_overrides=for_overrides)
else: else:
yield (key_parent, value) yield (key_parent, value)
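A small sketch of what the `for_overrides` flag changes (the config values are illustrative):

```python
from spacy import util

cfg = {"training": {"optimizer": {"@optimizers": "Adam.v1", "learn_rate": 0.001}}}

# Default behaviour: every leaf becomes its own dotted key.
print(util.dict_to_dot(cfg))
# {'training.optimizer.@optimizers': 'Adam.v1', 'training.optimizer.learn_rate': 0.001}

# With for_overrides=True, a block containing a registered function ("@..." key)
# is treated as a single value, so the whole block can be overridden at once.
print(util.dict_to_dot(cfg, for_overrides=True))
# {'training.optimizer': {'@optimizers': 'Adam.v1', 'learn_rate': 0.001}}
```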

View File

@ -1,10 +1,8 @@
cimport numpy as np
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from libc.stdint cimport uint32_t, uint64_t from libc.stdint cimport uint32_t, uint64_t
from libcpp.set cimport set as cppset from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64 from murmurhash.mrmr cimport hash128_x64
import functools
import warnings import warnings
from enum import Enum from enum import Enum
from typing import cast from typing import cast
@ -15,9 +13,11 @@ from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops from thinc.backends import get_array_ops
from thinc.types import Floats2d from thinc.types import Floats2d
from .attrs cimport ORTH, attr_id_t
from .strings cimport StringStore from .strings cimport StringStore
from . import util from . import util
from .attrs import IDS
from .errors import Errors, Warnings from .errors import Errors, Warnings
from .strings import get_string_id from .strings import get_string_id
@ -63,6 +63,7 @@ cdef class Vectors:
cdef readonly uint32_t hash_seed cdef readonly uint32_t hash_seed
cdef readonly unicode bow cdef readonly unicode bow
cdef readonly unicode eow cdef readonly unicode eow
cdef readonly attr_id_t attr
def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
"""Create a new vector store. """Create a new vector store.
@ -78,6 +79,8 @@ cdef class Vectors:
hash_seed (int): The floret hash seed (default: 0). hash_seed (int): The floret hash seed (default: 0).
bow (str): The floret BOW string (default: "<"). bow (str): The floret BOW string (default: "<").
eow (str): The floret EOW string (default: ">"). eow (str): The floret EOW string (default: ">").
attr (Union[int, str]): The token attribute for the vector keys
(default: "ORTH").
DOCS: https://spacy.io/api/vectors#init DOCS: https://spacy.io/api/vectors#init
""" """
@ -100,10 +103,18 @@ cdef class Vectors:
self.hash_seed = hash_seed self.hash_seed = hash_seed
self.bow = bow self.bow = bow
self.eow = eow self.eow = eow
if isinstance(attr, (int, long)):
self.attr = attr
else:
attr = attr.upper()
if attr == "TEXT":
attr = "ORTH"
self.attr = IDS.get(attr, ORTH)
if self.mode == Mode.default: if self.mode == Mode.default:
if data is None: if data is None:
if shape is None: if shape is None:
shape = (0,0) shape = (0, 0)
ops = get_current_ops() ops = get_current_ops()
data = ops.xp.zeros(shape, dtype="f") data = ops.xp.zeros(shape, dtype="f")
self._unset = cppset[int]({i for i in range(data.shape[0])}) self._unset = cppset[int]({i for i in range(data.shape[0])})
@ -246,8 +257,7 @@ cdef class Vectors:
return ( return (
self.shape == other.shape self.shape == other.shape
and self.key2row == other.key2row and self.key2row == other.key2row
and self.to_bytes(exclude=["strings"]) and self.to_bytes(exclude=["strings"]) == other.to_bytes(exclude=["strings"])
== other.to_bytes(exclude=["strings"])
) )
def resize(self, shape, inplace=False): def resize(self, shape, inplace=False):
@ -504,11 +514,12 @@ cdef class Vectors:
# vectors e.g. (10000, 300) # vectors e.g. (10000, 300)
# sims e.g. (1024, 10000) # sims e.g. (1024, 10000)
sims = xp.dot(batch, vectors.T) sims = xp.dot(batch, vectors.T)
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:] best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:, -n:]
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:] scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:, -n:]
if sort and n >= 2: if sort and n >= 2:
sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1] sorted_index = xp.arange(scores.shape[0])[:, None][i:i+batch_size], \
xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1]
scores[i:i+batch_size] = scores[sorted_index] scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index]
@ -522,8 +533,12 @@ cdef class Vectors:
numpy_rows = get_current_ops().to_numpy(best_rows) numpy_rows = get_current_ops().to_numpy(best_rows)
keys = xp.asarray( keys = xp.asarray(
[[row2key[row] for row in numpy_rows[i] if row in row2key] [
for i in range(len(queries)) ], dtype="uint64") [row2key[row] for row in numpy_rows[i] if row in row2key]
for i in range(len(queries))
],
dtype="uint64"
)
return (keys, best_rows, scores) return (keys, best_rows, scores)
def to_ops(self, ops: Ops): def to_ops(self, ops: Ops):
@ -543,6 +558,7 @@ cdef class Vectors:
"hash_seed": self.hash_seed, "hash_seed": self.hash_seed,
"bow": self.bow, "bow": self.bow,
"eow": self.eow, "eow": self.eow,
"attr": self.attr,
} }
def _set_cfg(self, cfg): def _set_cfg(self, cfg):
@ -553,6 +569,7 @@ cdef class Vectors:
self.hash_seed = cfg.get("hash_seed", 0) self.hash_seed = cfg.get("hash_seed", 0)
self.bow = cfg.get("bow", "<") self.bow = cfg.get("bow", "<")
self.eow = cfg.get("eow", ">") self.eow = cfg.get("eow", ">")
self.attr = cfg.get("attr", ORTH)
def to_disk(self, path, *, exclude=tuple()): def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory. """Save the current state to a directory.
@ -564,9 +581,9 @@ cdef class Vectors:
""" """
xp = get_array_module(self.data) xp = get_array_module(self.data)
if xp is numpy: if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) # no-cython-lint
else: else:
save_array = lambda arr, file_: xp.save(file_, arr) save_array = lambda arr, file_: xp.save(file_, arr) # no-cython-lint
def save_vectors(path): def save_vectors(path):
# the source of numpy.save indicates that the file object is closed after use. # the source of numpy.save indicates that the file object is closed after use.

View File

@ -1,6 +1,4 @@
# cython: profile=True # cython: profile=True
from libc.string cimport memcpy
import functools import functools
import numpy import numpy
@ -19,7 +17,6 @@ from .errors import Errors
from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
from .lang.norm_exceptions import BASE_NORMS from .lang.norm_exceptions import BASE_NORMS
from .lookups import Lookups from .lookups import Lookups
from .util import registry
from .vectors import Mode as VectorsMode from .vectors import Mode as VectorsMode
from .vectors import Vectors from .vectors import Vectors
@ -50,8 +47,15 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab DOCS: https://spacy.io/api/vocab
""" """
def __init__(self, lex_attr_getters=None, strings=None, lookups=None, def __init__(
oov_prob=-20., writing_system=None, get_noun_chunks=None): self,
lex_attr_getters=None,
strings=None,
lookups=None,
oov_prob=-20.,
writing_system=None,
get_noun_chunks=None
):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -150,7 +154,6 @@ cdef class Vocab:
cdef LexemeC* lex cdef LexemeC* lex
cdef hash_t key = self.strings[string] cdef hash_t key = self.strings[string]
lex = <LexemeC*>self._by_orth.get(key) lex = <LexemeC*>self._by_orth.get(key)
cdef size_t addr
if lex != NULL: if lex != NULL:
assert lex.orth in self.strings assert lex.orth in self.strings
if lex.orth != key: if lex.orth != key:
@ -352,8 +355,13 @@ cdef class Vocab:
self[orth] self[orth]
# Make prob negative so it sorts by rank ascending # Make prob negative so it sorts by rank ascending
# (key2row contains the rank) # (key2row contains the rank)
priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth) priority = []
for lex in self if lex.orth in self.vectors.key2row] cdef Lexeme lex
cdef attr_t value
for lex in self:
value = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if value in self.vectors.key2row:
priority.append((-lex.prob, self.vectors.key2row[value], value))
priority.sort() priority.sort()
indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64") indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
@ -386,8 +394,10 @@ cdef class Vocab:
""" """
if isinstance(orth, str): if isinstance(orth, str):
orth = self.strings.add(orth) orth = self.strings.add(orth)
if self.has_vector(orth): cdef Lexeme lex = self[orth]
return self.vectors[orth] key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.has_vector(key):
return self.vectors[key]
xp = get_array_module(self.vectors.data) xp = get_array_module(self.vectors.data)
vectors = xp.zeros((self.vectors_length,), dtype="f") vectors = xp.zeros((self.vectors_length,), dtype="f")
return vectors return vectors
@ -403,15 +413,16 @@ cdef class Vocab:
""" """
if isinstance(orth, str): if isinstance(orth, str):
orth = self.strings.add(orth) orth = self.strings.add(orth)
if self.vectors.is_full and orth not in self.vectors: cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.vectors.is_full and key not in self.vectors:
new_rows = max(100, int(self.vectors.shape[0]*1.3)) new_rows = max(100, int(self.vectors.shape[0]*1.3))
if self.vectors.shape[1] == 0: if self.vectors.shape[1] == 0:
width = vector.size width = vector.size
else: else:
width = self.vectors.shape[1] width = self.vectors.shape[1]
self.vectors.resize((new_rows, width)) self.vectors.resize((new_rows, width))
lex = self[orth] # Add word to vocab if necessary row = self.vectors.add(key, vector=vector)
row = self.vectors.add(orth, vector=vector)
if row >= 0: if row >= 0:
lex.rank = row lex.rank = row
@ -426,7 +437,9 @@ cdef class Vocab:
""" """
if isinstance(orth, str): if isinstance(orth, str):
orth = self.strings.add(orth) orth = self.strings.add(orth)
return orth in self.vectors cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
return key in self.vectors
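With this change, lexeme-level vector lookups go through the attribute configured on the vectors table. A minimal sketch mirroring the updated test at the top of this diff, assuming a `LOWER`-keyed table with made-up dimensions:

```python
import numpy
import spacy
from spacy.vectors import Vectors

nlp = spacy.blank("en")
# Hypothetical empty table with room for two 3-dimensional vectors, keyed on LOWER.
nlp.vocab.vectors = Vectors(strings=nlp.vocab.strings, shape=(2, 3), attr="LOWER")

nlp.vocab.set_vector("Dog", numpy.asarray([1.0, 2.0, 3.0], dtype="f"))
# The vector is stored under the LOWER key, so both surface forms resolve to it.
print(nlp.vocab["Dog"].has_vector, nlp.vocab["dog"].has_vector)
```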
property lookups: property lookups:
def __get__(self): def __get__(self):
@ -440,7 +453,6 @@ cdef class Vocab:
self.lookups.get_table("lexeme_norm"), self.lookups.get_table("lexeme_norm"),
) )
def to_disk(self, path, *, exclude=tuple()): def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory. """Save the current state to a directory.
@ -453,7 +465,6 @@ cdef class Vocab:
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
setters = ["strings", "vectors"]
if "strings" not in exclude: if "strings" not in exclude:
self.strings.to_disk(path / "strings.json") self.strings.to_disk(path / "strings.json")
if "vectors" not in exclude: if "vectors" not in exclude:
@ -472,7 +483,6 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#to_disk DOCS: https://spacy.io/api/vocab#to_disk
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
getters = ["strings", "vectors"]
if "strings" not in exclude: if "strings" not in exclude:
self.strings.from_disk(path / "strings.json") # TODO: add exclude? self.strings.from_disk(path / "strings.json") # TODO: add exclude?
if "vectors" not in exclude: if "vectors" not in exclude:

View File

@ -303,7 +303,7 @@ mapped to a zero vector. See the documentation on
| `nM` | The width of the static vectors. ~~Optional[int]~~ | | `nM` | The width of the static vectors. ~~Optional[int]~~ |
| `dropout` | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ | | `dropout` | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ |
| `init_W` | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]]], FloatsXd]~~ | | `init_W` | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]]], FloatsXd]~~ |
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ | | `key_attr` | This setting is ignored in spaCy v3.6+. To set a custom key attribute for vectors, configure it through [`Vectors`](/api/vectors) or [`spacy init vectors`](/api/cli#init-vectors). Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
### spacy.FeatureExtractor.v1 {id="FeatureExtractor"} ### spacy.FeatureExtractor.v1 {id="FeatureExtractor"}

View File

@ -876,7 +876,7 @@ token-to-vector embedding component like [`Tok2Vec`](/api/tok2vec) or
training a pipeline with components sourced from an existing pipeline: if training a pipeline with components sourced from an existing pipeline: if
multiple components (e.g. tagger, parser, NER) listen to the same multiple components (e.g. tagger, parser, NER) listen to the same
token-to-vector component, but some of them are frozen and not updated, their token-to-vector component, but some of them are frozen and not updated, their
performance may degrade significally as the token-to-vector component is updated performance may degrade significantly as the token-to-vector component is updated
with new data. To prevent this, listeners can be replaced with a standalone with new data. To prevent this, listeners can be replaced with a standalone
token-to-vector layer that is owned by the component and doesn't change if the token-to-vector layer that is owned by the component and doesn't change if the
component isn't updated. component isn't updated.

View File

@ -60,7 +60,7 @@ architectures and their arguments and hyperparameters.
| `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ | | `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
| `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ | | `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
| `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ | | `max_length` | Maximum length of the produced spans, defaults to `25`. ~~Optional[int]~~ |
| `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ | | `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | | `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
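For context on the `max_length` default change, a hedged example of adding the component with an explicit config (keys taken from the table above; the pipeline still needs training before it predicts anything useful):

```python
import spacy

nlp = spacy.blank("en")
# The span finder stores unlabeled candidate spans under doc.spans["sc"] by default;
# max_length now defaults to 25 rather than unlimited.
nlp.add_pipe("span_finder", config={"spans_key": "sc", "max_length": 25})
```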

View File

@ -59,6 +59,7 @@ modified later.
| `hash_seed` <Tag variant="new">3.2</Tag> | The floret hash seed (default: `0`). ~~int~~ | | `hash_seed` <Tag variant="new">3.2</Tag> | The floret hash seed (default: `0`). ~~int~~ |
| `bow` <Tag variant="new">3.2</Tag> | The floret BOW string (default: `"<"`). ~~str~~ | | `bow` <Tag variant="new">3.2</Tag> | The floret BOW string (default: `"<"`). ~~str~~ |
| `eow` <Tag variant="new">3.2</Tag> | The floret EOW string (default: `">"`). ~~str~~ | | `eow` <Tag variant="new">3.2</Tag> | The floret EOW string (default: `">"`). ~~str~~ |
| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys (default: `"ORTH"`). ~~Union[int, str]~~ |
## Vectors.\_\_getitem\_\_ {id="getitem",tag="method"} ## Vectors.\_\_getitem\_\_ {id="getitem",tag="method"}
@ -453,7 +454,8 @@ Load state from a binary string.
## Attributes {id="attributes"} ## Attributes {id="attributes"}
| Name | Description | | Name | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ | | `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
| `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ | | `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ |
| `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ | | `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys. ~~int~~ |
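A short sketch of the new attribute (the vector values and keys are placeholders):

```python
import numpy
from spacy.vectors import Vectors

data = numpy.asarray([[0.1, 0.9], [0.8, 0.2]], dtype="f")
vectors = Vectors(data=data, keys=["cat", "dog"], attr="LOWER")
# `attr` is stored as the integer attribute ID and is included in the
# serialized config, so it survives to_disk/from_disk round-trips.
print(vectors.attr)
print(vectors.shape)
```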

View File

@ -113,7 +113,7 @@ print(doc[2].morph) # 'Case=Nom|Person=2|PronType=Prs'
print(doc[2].pos_) # 'PRON' print(doc[2].pos_) # 'PRON'
``` ```
## Lemmatization {id="lemmatization",model="lemmatizer",version="3"} ## Lemmatization {id="lemmatization",version="3"}
spaCy provides two pipeline components for lemmatization: spaCy provides two pipeline components for lemmatization:
@ -170,7 +170,7 @@ nlp = spacy.blank("sv")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
``` ```
### Rule-based lemmatizer {id="lemmatizer-rule"} ### Rule-based lemmatizer {id="lemmatizer-rule",model="morphologizer"}
When training pipelines that include a component that assigns part-of-speech When training pipelines that include a component that assigns part-of-speech
tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a
@ -194,7 +194,7 @@ information, without consulting the context of the token. The rule-based
lemmatizer also accepts list-based exception files. For English, these are lemmatizer also accepts list-based exception files. For English, these are
acquired from [WordNet](https://wordnet.princeton.edu/). acquired from [WordNet](https://wordnet.princeton.edu/).
### Trainable lemmatizer ### Trainable lemmatizer {id="lemmatizer-train",model="trainable_lemmatizer"}
The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma
transformations from a training corpus that includes lemma annotations. This transformations from a training corpus that includes lemma annotations. This

View File

@ -11,7 +11,6 @@ menu:
- ['Custom Functions', 'custom-functions'] - ['Custom Functions', 'custom-functions']
- ['Initialization', 'initialization'] - ['Initialization', 'initialization']
- ['Data Utilities', 'data'] - ['Data Utilities', 'data']
- ['Parallel Training', 'parallel-training']
- ['Internal API', 'api'] - ['Internal API', 'api']
--- ---
@ -1565,77 +1564,6 @@ token-based annotations like the dependency parse or entity labels, you'll need
to take care to adjust the `Example` object so its annotations match and remain to take care to adjust the `Example` object so its annotations match and remain
valid. valid.
## Parallel & distributed training with Ray {id="parallel-training"}
> #### Installation
>
> ```bash
> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. Parallel
training won't always be faster, though; it depends on your batch size, models,
and hardware.
<Infobox variant="warning">
To use Ray with spaCy, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
setup. You can optionally set the `--address` option to point to your Ray
cluster. If it's not set, Ray will run locally.
```bash
python -m spacy ray train config.cfg --n-workers 2
```
<Project id="integrations/ray">
Get started with parallel training using our project template. It trains a
simple model on a Universal Dependencies Treebank and lets you parallelize the
training with Ray.
</Project>
### How parallel training works {id="parallel-training-details"}
Each worker receives a shard of the **data** and builds a copy of the **model
and optimizer** from the [`config.cfg`](#config). It also has a communication
channel to **pass gradients and parameters** to the other workers. Additionally,
each worker is given ownership of a subset of the parameter arrays. Every
parameter array is owned by exactly one worker, and the workers are given a
mapping so they know which worker owns which parameter.
![Illustration of setup](/images/spacy-ray.svg)
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the gradient.
Workers use the gradients they receive and the ones they compute locally to
update the parameters they own, and then broadcast the updated array and a new
version ID to the other workers.
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates; they do not have to pull
them and block on the result, so the transfers can happen in the background,
overlapped with the actual training work. The workers also do not have to stop
and wait for each other ("synchronize") at the start of each batch. This is very
useful for spaCy, because spaCy is often trained on long documents, which means
**batches can vary in size** significantly. Uneven workloads make synchronous
gradient descent inefficient, because if one batch is slow, all of the other
workers are stuck waiting for it to complete before they can continue.
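The following is only a conceptual sketch of that ownership and routing idea, not the actual `spacy-ray` implementation or API; all names and values are made up:

```python
import numpy

# Conceptual sketch: each parameter array has exactly one owning worker.
n_workers = 2
params = {"tok2vec.W": numpy.zeros((4, 4)), "ner.W": numpy.zeros((4, 4))}
versions = {name: 0 for name in params}
owner = {name: i % n_workers for i, name in enumerate(sorted(params))}

def push_gradient(worker_id, name, grad, seen_version):
    """Route a locally computed gradient without blocking."""
    if owner[name] != worker_id:
        # Non-owners send the gradient to the owner and keep training.
        return ("send_to_owner", owner[name])
    if seen_version != versions[name]:
        # Stale gradient: it was computed against an old parameter version.
        return ("discard", None)
    params[name] -= 0.001 * grad   # simple SGD-style update by the owner
    versions[name] += 1            # new version is broadcast to the peers
    return ("broadcast", versions[name])
```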
## Internal training API {id="api"} ## Internal training API {id="api"}
<Infobox variant="danger"> <Infobox variant="danger">

website/docs/usage/v3-6.mdx (new file, 143 lines)
View File

@ -0,0 +1,143 @@
---
title: What's New in v3.6
teaser: New features and how to upgrade
menu:
- ['New Features', 'features']
- ['Upgrading Notes', 'upgrading']
---
## New features {id="features",hidden="true"}
spaCy v3.6 adds the new [`SpanFinder`](/api/spanfinder) component to the core
spaCy library and new trained pipelines for Slovenian.
### SpanFinder {id="spanfinder"}
The [`SpanFinder`](/api/spanfinder) component identifies potentially
overlapping, unlabeled spans by identifying span start and end tokens. It is
intended for use in combination with a component like
[`SpanCategorizer`](/api/spancategorizer) that may further filter or label the
spans. See our
[Spancat blog post](https://explosion.ai/blog/spancat#span-finder) for a more
detailed introduction to the span finder.
To train a pipeline with `span_finder` + `spancat`, remember to add
`span_finder` (and its `tok2vec` or `transformer` if required) to
`[training.annotating_components]` so that the `spancat` component can be
trained directly from its predictions:
```ini
[nlp]
pipeline = ["tok2vec","span_finder","spancat"]
[training]
annotating_components = ["tok2vec","span_finder"]
```
In practice it can be helpful to initially train the `span_finder` separately
before [sourcing](/usage/processing-pipelines#sourced-components) it (along with
its `tok2vec`) into the `spancat` pipeline for further training. Otherwise the
memory usage can spike for `spancat` in the first few training steps if the
`span_finder` makes a large number of predictions.
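A rough config sketch of that sourcing step follows; the `model-best` path is a placeholder for wherever the first `span_finder` run was saved, and the `frozen_components` line is optional if you do want the sourced components updated further:

```ini
[components.tok2vec]
source = "training/span_finder/model-best"

[components.span_finder]
source = "training/span_finder/model-best"

[nlp]
pipeline = ["tok2vec","span_finder","spancat"]

[training]
annotating_components = ["tok2vec","span_finder"]
frozen_components = ["tok2vec","span_finder"]
```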
### Additional features and improvements {id="additional-features-and-improvements"}
- Language updates:
- Add initial support for Malay.
- Update Latin defaults to support noun chunks, update lexical/tokenizer
settings and add example sentences.
- Support `spancat_singlelabel` in `spacy debug data` CLI.
- Add `doc.spans` rendering to `spacy evaluate` CLI displaCy output.
- Support custom token/lexeme attribute for vectors.
- Add option to return scores separately keyed by component name with
`spacy evaluate --per-component`, `Language.evaluate(per_component=True)` and
`Scorer.score(per_component=True)`. This is useful when the pipeline contains
more than one of the same component like `textcat` that may have overlapping
scores keys.
- Typing updates for `PhraseMatcher` and `SpanGroup`.
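A minimal sketch of the per-component scoring option, assuming the `en_core_web_sm` pipeline is installed and using a single made-up `Example`:

```python
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc("Apple is looking at buying a U.K. startup.")
examples = [Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})]

# Scores come back keyed by component name instead of one flat dict.
scores = nlp.evaluate(examples, per_component=True)
print(list(scores))  # e.g. ['tok2vec', 'tagger', 'parser', ...]
```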
## Trained pipelines {id="pipelines"}
### New trained pipelines {id="new-pipelines"}
v3.6 introduces new pipelines for Slovenian, which use the trainable lemmatizer
and [floret vectors](https://github.com/explosion/floret).
| Package | UPOS | Parser LAS | NER F |
| ------------------------------------------------- | ---: | ---------: | ----: |
| [`sl_core_news_sm`](/models/sl#sl_core_news_sm) | 96.9 | 82.1 | 62.9 |
| [`sl_core_news_md`](/models/sl#sl_core_news_md) | 97.6 | 84.3 | 73.5 |
| [`sl_core_news_lg`](/models/sl#sl_core_news_lg) | 97.7 | 84.3 | 79.0 |
| [`sl_core_news_trf`](/models/sl#sl_core_news_trf) | 99.0 | 91.7 | 90.0 |
### Pipeline updates {id="pipeline-updates"}
The English pipelines have been updated to improve handling of contractions with
various apostrophes and to lemmatize "get" as a passive auxiliary.
The Danish pipeline `da_core_news_trf` has been updated to use
[`vesteinn/DanskBERT`](https://huggingface.co/vesteinn/DanskBERT) with
performance improvements across the board.
## Notes about upgrading from v3.5 {id="upgrading"}
### SpanGroup spans are now required to be from the same doc {id="spangroup-spans"}
When initializing a `SpanGroup`, there is a new check to verify that all added
spans refer to the current doc. Without this check, it was possible to run into
string store or other errors.
One place this may crop up is when creating `Example` objects for training with
custom spans:
```diff
doc = Doc(nlp.vocab, words=tokens) # predicted doc
example = Example.from_dict(doc, {"ner": iob_tags})
# use the reference doc when creating reference spans
- span = Span(doc, 0, 5, "ORG")
+ span = Span(example.reference, 0, 5, "ORG")
example.reference.spans[spans_key] = [span]
```
### Pipeline package version compatibility {id="version-compat"}
> #### Using legacy implementations
>
> In spaCy v3, you'll still be able to load and reference legacy implementations
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
> components or architectures change and newer versions are available in the
> core library.
When you're loading a pipeline package trained with an earlier version of spaCy
v3, you will see a warning telling you that the pipeline may be incompatible.
This doesn't necessarily have to be true, but we recommend running your
pipelines against your test suite or evaluation data to make sure there are no
unexpected results.
If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).
If you've trained your own custom pipeline and you've confirmed that it's still
working as expected, you can update the spaCy version requirements in the
[`meta.json`](/api/data-formats#meta):
```diff
- "spacy_version": ">=3.5.0,<3.6.0",
+ "spacy_version": ">=3.5.0,<3.7.0",
```
### Updating v3.5 configs
To update a config from spaCy v3.5 with the new v3.6 settings, run
[`init fill-config`](/api/cli#init-fill-config):
```cli
$ python -m spacy init fill-config config-v3.5.cfg config-v3.6.cfg
```
In many cases ([`spacy train`](/api/cli#train),
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
automatically, but you'll need to fill in the new settings to run
[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
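For instance, after filling the config, something like the following should work (the data paths are placeholders for your own corpora):

```cli
$ python -m spacy debug data config-v3.6.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
```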

View File

@ -222,7 +222,9 @@
}, },
{ {
"code": "la", "code": "la",
"name": "Latin" "name": "Latin",
"example": "In principio creavit Deus caelum et terram.",
"has_examples": true
}, },
{ {
"code": "lb", "code": "lb",
@ -339,7 +341,10 @@
}, },
{ {
"code": "sl", "code": "sl",
"name": "Slovenian" "name": "Slovenian",
"example": "France Prešeren je umrl 8. februarja 1849 v Kranju",
"has_examples": true,
"models": ["sl_core_news_sm", "sl_core_news_md", "sl_core_news_lg", "sl_core_news_trf"]
}, },
{ {
"code": "sq", "code": "sq",

View File

@ -14,7 +14,8 @@
{ "text": "New in v3.2", "url": "/usage/v3-2" }, { "text": "New in v3.2", "url": "/usage/v3-2" },
{ "text": "New in v3.3", "url": "/usage/v3-3" }, { "text": "New in v3.3", "url": "/usage/v3-3" },
{ "text": "New in v3.4", "url": "/usage/v3-4" }, { "text": "New in v3.4", "url": "/usage/v3-4" },
{ "text": "New in v3.5", "url": "/usage/v3-5" } { "text": "New in v3.5", "url": "/usage/v3-5" },
{ "text": "New in v3.6", "url": "/usage/v3-6" }
] ]
}, },
{ {

View File

@ -27,7 +27,7 @@
"indexName": "spacy" "indexName": "spacy"
}, },
"binderUrl": "explosion/spacy-io-binder", "binderUrl": "explosion/spacy-io-binder",
"binderVersion": "3.5", "binderVersion": "3.6",
"sections": [ "sections": [
{ "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "usage", "title": "Usage Documentation", "theme": "blue" },
{ "id": "models", "title": "Models Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" },

View File

@ -4376,7 +4376,7 @@
"code_example": [ "code_example": [
"import spacy", "import spacy",
"", "",
"nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])", "nlp = spacy.load(\"en_core_web_sm\", exclude=[\"ner\"])",
"nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})", "nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
"", "",
"text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\", "text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",

View File

@ -13,6 +13,8 @@ import 'prismjs/components/prism-json.min.js'
import 'prismjs/components/prism-markdown.min.js' import 'prismjs/components/prism-markdown.min.js'
import 'prismjs/components/prism-python.min.js' import 'prismjs/components/prism-python.min.js'
import 'prismjs/components/prism-yaml.min.js' import 'prismjs/components/prism-yaml.min.js'
import 'prismjs/components/prism-docker.min.js'
import 'prismjs/components/prism-r.min.js'
import { isString } from './util' import { isString } from './util'
import Link, { OptionalLink } from './link' import Link, { OptionalLink } from './link'
@ -172,7 +174,7 @@ const convertLine = ({ line, prompt, lang }) => {
return handlePromot({ lineFlat, prompt }) return handlePromot({ lineFlat, prompt })
} }
return lang === 'none' || !lineFlat ? ( return lang === 'none' || !lineFlat || !(lang in Prism.languages) ? (
lineFlat lineFlat
) : ( ) : (
<span <span

View File

@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
} }
const navAlert = ( const navAlert = (
<Link to="/usage/v3-5" noLinkLayout> <Link to="/usage/v3-6" noLinkLayout>
<strong>💥 Out now:</strong> spaCy v3.5 <strong>💥 Out now:</strong> spaCy v3.6
</Link> </Link>
) )