cython fixes and cleanup

This commit is contained in:
svlandeg 2023-07-19 17:41:29 +02:00
parent 846472129c
commit 96f2e30c4b
18 changed files with 118 additions and 128 deletions

View File

@ -158,7 +158,6 @@ cdef class PhraseMatcher:
del self._callbacks[key] del self._callbacks[key]
del self._docs[key] del self._docs[key]
def _add_from_arrays(self, key, specs, *, on_match=None): def _add_from_arrays(self, key, specs, *, on_match=None):
"""Add a preprocessed list of specs, with an optional callback. """Add a preprocessed list of specs, with an optional callback.
@ -194,7 +193,6 @@ cdef class PhraseMatcher:
result = internal_node result = internal_node
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL) map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
def add(self, key, docs, *, on_match=None): def add(self, key, docs, *, on_match=None):
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
key, a list of one or more patterns, and (optionally) an on_match callback. key, a list of one or more patterns, and (optionally) an on_match callback.

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, cdivision=True, boundscheck=False # cython: infer_types=True, cdivision=True, boundscheck=False
from typing import Any, List, Optional, Tuple, TypeVar, cast from typing import Any, List, Optional, Tuple, cast
from libc.stdlib cimport calloc, free, realloc from libc.stdlib cimport calloc, free, realloc
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
@ -23,7 +23,7 @@ from thinc.api import (
from thinc.backends.cblas cimport CBlas, saxpy, sgemm from thinc.backends.cblas cimport CBlas, saxpy, sgemm
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
from ..errors import Errors from ..errors import Errors
from ..pipeline._parser_internals import _beam_utils from ..pipeline._parser_internals import _beam_utils
@ -136,7 +136,7 @@ def init(
Y: Optional[Tuple[List[State], List[Floats2d]]] = None, Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
): ):
if X is not None: if X is not None:
docs, moves = X docs, _ = X
model.get_ref("tok2vec").initialize(X=docs) model.get_ref("tok2vec").initialize(X=docs)
else: else:
model.get_ref("tok2vec").initialize() model.get_ref("tok2vec").initialize()
@ -145,7 +145,7 @@ def init(
current_nO = model.maybe_get_dim("nO") current_nO = model.maybe_get_dim("nO")
if current_nO is None or current_nO != inferred_nO: if current_nO is None or current_nO != inferred_nO:
model.attrs["resize_output"](model, inferred_nO) model.attrs["resize_output"](model, inferred_nO)
nO = model.get_dim("nO") # nO = model.get_dim("nO")
nP = model.get_dim("nP") nP = model.get_dim("nP")
nH = model.get_dim("nH") nH = model.get_dim("nH")
nI = model.get_dim("nI") nI = model.get_dim("nI")
@ -194,7 +194,8 @@ class TransitionModelInputs:
moves: TransitionSystem, moves: TransitionSystem,
actions: Optional[List[Ints1d]] = None, actions: Optional[List[Ints1d]] = None,
max_moves: int = 0, max_moves: int = 0,
states: Optional[List[State]]=None): states: Optional[List[State]] = None,
):
""" """
actions (Optional[List[Ints1d]]): actions to apply for each Doc. actions (Optional[List[Ints1d]]): actions to apply for each Doc.
docs (List[Doc]): Docs to predict transition sequences for. docs (List[Doc]): Docs to predict transition sequences for.
@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
return (states, scores), backprop return (states, scores), backprop
cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None): WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
cdef int i, j cdef int i
cdef vector[StateC *] unfinished cdef vector[StateC *] unfinished
cdef ActivationsC activations = _alloc_activations(sizes) cdef ActivationsC activations = _alloc_activations(sizes)
cdef np.ndarray step_scores cdef np.ndarray step_scores
@ -371,7 +373,7 @@ def _forward_fallback(
for clas in set(model.attrs["unseen_classes"]): for clas in set(model.attrs["unseen_classes"]):
if (d_scores[:, clas] < 0).any(): if (d_scores[:, clas] < 0).any():
model.attrs["unseen_classes"].remove(clas) model.attrs["unseen_classes"].remove(clas)
d_scores *= seen_mask == False d_scores *= seen_mask == False # no-cython-lint
# Calculate the gradients for the parameters of the output layer. # Calculate the gradients for the parameters of the output layer.
# The weight gemm is (nS, nO) @ (nS, nH).T # The weight gemm is (nS, nO) @ (nS, nH).T
output.inc_grad("b", d_scores.sum(axis=0)) output.inc_grad("b", d_scores.sum(axis=0))
@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
scores[i*n.classes+j] = min_ scores[i*n.classes+j] = min_
cdef void _sum_state_features(CBlas cblas, float* output, cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
const float* cached, const int* token_ids, SizesC n) nogil: const int* token_ids, SizesC n) nogil:
cdef int idx, b, f, i cdef int idx, b, f
cdef const float* feature cdef const float* feature
cdef int B = n.states cdef int B = n.states
cdef int O = n.hiddens * n.pieces cdef int O = n.hiddens * n.pieces # no-cython-lint
cdef int F = n.feats cdef int F = n.feats
cdef int T = n.tokens cdef int T = n.tokens
padding = cached + (T * F * O) padding = cached + (T * F * O)
@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
feature = &cached[idx] feature = &cached[idx]
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1) saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
token_ids += F token_ids += F

View File

@ -80,7 +80,6 @@ cdef class Morphology:
out.sort(key=lambda x: x[0]) out.sort(key=lambda x: x[0])
return dict(out) return dict(out)
def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str: def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
norm_feats_string = self.FEATURE_SEP.join([ norm_feats_string = self.FEATURE_SEP.join([
self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values]) self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
@ -88,7 +87,6 @@ cdef class Morphology:
]) ])
return norm_feats_string or self.EMPTY_MORPH return norm_feats_string or self.EMPTY_MORPH
cdef hash_t _add(self, features): cdef hash_t _add(self, features):
"""Insert a morphological analysis in the morphology table, if not """Insert a morphological analysis in the morphology table, if not
already present. The morphological analysis may be provided in the UD already present. The morphological analysis may be provided in the UD

View File

@ -1,5 +1,4 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from libc.stdint cimport int32_t
from libcpp.memory cimport shared_ptr from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector from libcpp.vector cimport vector

View File

@ -58,7 +58,6 @@ cdef class Beam:
void* extra_args) except -1 void* extra_args) except -1
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1 cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil: cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
self.scores[i][j] = score self.scores[i][j] = score
self.is_valid[i][j] = is_valid self.is_valid[i][j] = is_valid

View File

@ -1,11 +1,8 @@
# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
cimport cython cimport cython
from libc.math cimport exp, log
from libc.string cimport memcpy, memset
import math
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from libc.math cimport exp
from libc.string cimport memcpy, memset
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
@ -70,7 +67,7 @@ cdef class Beam:
self.costs[i][j] = costs[j] self.costs[i][j] = costs[j]
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1: cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
cdef int i, j cdef int i
for i in range(self.width): for i in range(self.width):
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class) memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class) memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@ -176,7 +173,6 @@ cdef class Beam:
beam-width, and n is the number of classes. beam-width, and n is the number of classes.
""" """
cdef Entry entry cdef Entry entry
cdef weight_t score
cdef _State* s cdef _State* s
cdef int i, j, move_id cdef int i, j, move_id
assert self.size >= 1 assert self.size >= 1

View File

@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
action.do(states[i], action.label) action.do(states[i], action.label)
states[i].history.push_back(guess) states[i].history.push_back(guess)
free(is_valid) free(is_valid)

View File

@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from itertools import islice from itertools import islice
from typing import Callable, Dict, Iterable, List, Optional, Union from typing import Callable, Dict, Iterable, Optional, Union
import srsly
from thinc.api import Config, Model from thinc.api import Config, Model
from thinc.legacy import LegacySequenceCategoricalCrossentropy from thinc.legacy import LegacySequenceCategoricalCrossentropy

View File

@ -1,12 +1,11 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
import warnings
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
import srsly import srsly
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..errors import Errors, Warnings from ..errors import Errors
from ..language import Language from ..language import Language
from ..training import Example from ..training import Example
from ..util import raise_error from ..util import raise_error

View File

@ -1,5 +1,4 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
import warnings
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
import srsly import srsly
@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from .. import util from .. import util
from ..errors import Errors, Warnings from ..errors import Errors
from ..language import Language from ..language import Language
from ..training import Example, validate_distillation_examples, validate_examples from ..training import Example, validate_distillation_examples, validate_examples
from ..vocab import Vocab from ..vocab import Vocab
@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
except Exception as e: except Exception as e:
error_handler(self.name, self, [doc], e) error_handler(self.name, self, [doc], e)
def distill(self, def distill(self,
teacher_pipe: Optional["TrainablePipe"], teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"], examples: Iterable["Example"],
*, *,
drop: float = 0.0, drop: float = 0.0,
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: losses: Optional[Dict[str, float]] = None
) -> Dict[str, float]:
"""Train a pipe (the student) on the predictions of another pipe """Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is typically trained on the probability (the teacher). The student is typically trained on the probability
distribution of the teacher, but details may differ per pipe. distribution of the teacher, but details may differ per pipe.

View File

@ -227,7 +227,8 @@ class Parser(TrainablePipe):
*, *,
drop: float = 0.0, drop: float = 0.0,
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]]=None): losses: Optional[Dict[str, float]] = None
):
"""Train a pipe (the student) on the predictions of another pipe """Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is trained on the transition probabilities (the teacher). The student is trained on the transition probabilities
of the teacher. of the teacher.
@ -277,7 +278,9 @@ class Parser(TrainablePipe):
# teacher's distributions. # teacher's distributions.
student_inputs = TransitionModelInputs(docs=student_docs, student_inputs = TransitionModelInputs(docs=student_docs,
states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves) states=[state.copy() for state in states],
moves=self.moves,
max_moves=max_moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = _states_diff_to_actions(states, student_states) actions = _states_diff_to_actions(states, student_states)
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
@ -294,7 +297,6 @@ class Parser(TrainablePipe):
return losses return losses
def get_teacher_student_loss( def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d], self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
normalize: bool = False, normalize: bool = False,
@ -436,8 +438,10 @@ class Parser(TrainablePipe):
else: else:
init_states, gold_states, _ = self.moves.init_gold_batch(examples) init_states, gold_states, _ = self.moves.init_gold_batch(examples)
inputs = TransitionModelInputs(docs=docs, moves=self.moves, inputs = TransitionModelInputs(docs=docs,
max_moves=max_moves, states=[state.copy() for state in init_states]) moves=self.moves,
max_moves=max_moves,
states=[state.copy() for state in init_states])
(pred_states, scores), backprop_scores = self.model.begin_update(inputs) (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
if sum(s.shape[0] for s in scores) == 0: if sum(s.shape[0] for s in scores) == 0:
return losses return losses
@ -483,9 +487,7 @@ class Parser(TrainablePipe):
cdef TransitionSystem moves = self.moves cdef TransitionSystem moves = self.moves
cdef StateClass state cdef StateClass state
cdef int clas cdef int clas
cdef int nF = self.model.get_dim("nF")
cdef int nO = moves.n_moves cdef int nO = moves.n_moves
cdef int nS = sum([len(history) for history in histories])
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef np.ndarray costs_i cdef np.ndarray costs_i
is_valid = <int*>mem.alloc(nO, sizeof(int)) is_valid = <int*>mem.alloc(nO, sizeof(int))
@ -552,8 +554,8 @@ class Parser(TrainablePipe):
return losses return losses
def update_beam(self, examples, *, beam_width, def update_beam(self, examples, *, beam_width, drop=0.,
drop=0., sgd=None, losses=None, beam_density=0.0): sgd=None, losses=None, beam_density=0.0):
raise NotImplementedError raise NotImplementedError
def set_output(self, nO): def set_output(self, nO):
@ -678,7 +680,8 @@ class Parser(TrainablePipe):
return states return states
# Parse the states that are too long with the teacher's parsing model. # Parse the states that are too long with the teacher's parsing model.
teacher_inputs = TransitionModelInputs(docs=docs, moves=moves, teacher_inputs = TransitionModelInputs(docs=docs,
moves=moves,
states=[state.copy() for state in to_cut]) states=[state.copy() for state in to_cut])
(teacher_states, _) = teacher_pipe.model.predict(teacher_inputs) (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
@ -778,6 +781,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
return actions return actions
def _states_diff_to_actions( def _states_diff_to_actions(
before_states: List[StateClass], before_states: List[StateClass],
after_states: List[StateClass] after_states: List[StateClass]
@ -798,7 +802,8 @@ def _states_diff_to_actions(
c_state_before = before_state.c c_state_before = before_state.c
c_state_after = after_state.c c_state_after = after_state.c
assert equal(c_state_before.history.begin(), c_state_before.history.end(), assert equal(c_state_before.history.begin(),
c_state_before.history.end(),
c_state_after.history.begin()) c_state_after.history.begin())
actions = [] actions = []

View File

@ -1,7 +1,6 @@
# cython: infer_types=True # cython: infer_types=True
from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union from typing import Iterable, Iterator, List, Optional, Tuple, Union
cimport cython
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.string cimport memcpy from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
@ -243,7 +242,6 @@ cdef class StringStore:
cdef int n_length_bytes cdef int n_length_bytes
cdef int i cdef int i
cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str)) cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s): if length < sizeof(string.s):
string.s[0] = <unsigned char>length string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length) memcpy(&string.s[1], chars, length)
@ -301,7 +299,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
try: try:
return hash_string(string_or_hash) return hash_string(string_or_hash)
except: except: # no-cython-lint
if _try_coerce_to_hash(string_or_hash, &str_hash): if _try_coerce_to_hash(string_or_hash, &str_hash):
# Coerce the integral key to the expected primitive hash type. # Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types # This ensures that custom/overloaded "primitive" data types
@ -318,6 +316,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
try: try:
out_hash[0] = key out_hash[0] = key
return True return True
except: except: # no-cython-lint
return False return False

View File

@ -2,7 +2,7 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
from spacy.typedefs cimport class_t, weight_t from spacy.typedefs cimport class_t
import pytest import pytest
@ -42,6 +42,7 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
state = <TestState*>state state = <TestState*>state
mem.free(state) mem.free(state)
@cytest @cytest
@pytest.mark.parametrize("nr_class,beam_width", @pytest.mark.parametrize("nr_class,beam_width",
[ [
@ -56,10 +57,12 @@ def test_init(nr_class, beam_width):
assert b.width == beam_width assert b.width == beam_width
assert b.nr_class == nr_class assert b.nr_class == nr_class
@cytest @cytest
def test_init_violn(): def test_init_violn():
MaxViolation() MaxViolation()
@cytest @cytest
@pytest.mark.parametrize("nr_class,beam_width,length", @pytest.mark.parametrize("nr_class,beam_width,length",
[ [

View File

@ -1,5 +1,4 @@
cimport numpy as np cimport numpy as np
from libc.string cimport memset
from ..errors import Errors from ..errors import Errors
from ..morphology import Morphology from ..morphology import Morphology

View File

@ -225,8 +225,8 @@ cdef class Span:
@property @property
def _(self): def _(self):
cdef SpanC* span_c = self.span_c()
"""Custom extension attributes registered via `set_extension`.""" """Custom extension attributes registered via `set_extension`."""
cdef SpanC* span_c = self.span_c()
return Underscore(Underscore.span_extensions, self, return Underscore(Underscore.span_extensions, self,
start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
@ -933,7 +933,6 @@ cdef class Span:
self.id_ = ent_id_ self.id_ = ent_id_
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
# Don't allow spaces to be the root, if there are # Don't allow spaces to be the root, if there are
# better candidates # better candidates