cython fixes and cleanup

This commit is contained in:
svlandeg 2023-07-19 17:41:29 +02:00
parent 846472129c
commit 96f2e30c4b
18 changed files with 118 additions and 128 deletions

View File

@ -158,7 +158,6 @@ cdef class PhraseMatcher:
del self._callbacks[key]
del self._docs[key]
def _add_from_arrays(self, key, specs, *, on_match=None):
"""Add a preprocessed list of specs, with an optional callback.
@ -194,7 +193,6 @@ cdef class PhraseMatcher:
result = internal_node
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
def add(self, key, docs, *, on_match=None):
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
key, a list of one or more patterns, and (optionally) an on_match callback.

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
from typing import Any, List, Optional, Tuple, TypeVar, cast
from typing import Any, List, Optional, Tuple, cast
from libc.stdlib cimport calloc, free, realloc
from libc.string cimport memcpy, memset
@ -23,7 +23,7 @@ from thinc.api import (
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
from ..errors import Errors
from ..pipeline._parser_internals import _beam_utils
@ -136,7 +136,7 @@ def init(
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
):
if X is not None:
docs, moves = X
docs, _ = X
model.get_ref("tok2vec").initialize(X=docs)
else:
model.get_ref("tok2vec").initialize()
@ -145,7 +145,7 @@ def init(
current_nO = model.maybe_get_dim("nO")
if current_nO is None or current_nO != inferred_nO:
model.attrs["resize_output"](model, inferred_nO)
nO = model.get_dim("nO")
# nO = model.get_dim("nO")
nP = model.get_dim("nP")
nH = model.get_dim("nH")
nI = model.get_dim("nI")
@ -192,9 +192,10 @@ class TransitionModelInputs:
self,
docs: List[Doc],
moves: TransitionSystem,
actions: Optional[List[Ints1d]]=None,
max_moves: int=0,
states: Optional[List[State]]=None):
actions: Optional[List[Ints1d]] = None,
max_moves: int = 0,
states: Optional[List[State]] = None,
):
"""
actions (Optional[List[Ints1d]]): actions to apply for each Doc.
docs (List[Doc]): Docs to predict transition sequences for.
@ -234,12 +235,12 @@ def forward(model, inputs: TransitionModelInputs, is_train: bool):
return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
else:
return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
feats, backprop_feats, seen_mask, is_train, actions=actions,
max_moves=inputs.max_moves)
feats, backprop_feats, seen_mask, is_train, actions=actions,
max_moves=inputs.max_moves)
def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
cdef vector[StateC*] c_states
cdef StateClass state
for state in states:
@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
return (states, scores), backprop
cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
cdef int i, j
cdef int i
cdef vector[StateC *] unfinished
cdef ActivationsC activations = _alloc_activations(sizes)
cdef np.ndarray step_scores
@ -276,7 +278,7 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
if actions is None:
# Validate actions, argmax, take action.
c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
sizes.states)
sizes.states)
else:
c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
for i in range(sizes.states):
@ -302,8 +304,8 @@ def _forward_fallback(
backprop_feats,
seen_mask,
is_train: bool,
actions: Optional[List[Ints1d]]=None,
max_moves: int=0):
actions: Optional[List[Ints1d]] = None,
max_moves: int = 0):
nF = model.get_dim("nF")
output = model.get_ref("output")
hidden_b = model.get_param("hidden_b")
@ -371,7 +373,7 @@ def _forward_fallback(
for clas in set(model.attrs["unseen_classes"]):
if (d_scores[:, clas] < 0).any():
model.attrs["unseen_classes"].remove(clas)
d_scores *= seen_mask == False
d_scores *= seen_mask == False # no-cython-lint
# Calculate the gradients for the parameters of the output layer.
# The weight gemm is (nS, nO) @ (nS, nH).T
output.inc_grad("b", d_scores.sum(axis=0))
@ -571,13 +573,13 @@ cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
A._max_size = n.states
else:
A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0]))
n.states * n.feats * sizeof(A.token_ids[0]))
A.unmaxed = <float*>realloc(A.unmaxed,
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
A.hiddens = <float*>realloc(A.hiddens,
n.states * n.hiddens * sizeof(A.hiddens[0]))
n.states * n.hiddens * sizeof(A.hiddens[0]))
A.is_valid = <int*>realloc(A.is_valid,
n.states * n.classes * sizeof(A.is_valid[0]))
n.states * n.classes * sizeof(A.is_valid[0]))
A._max_size = n.states
A._curr_size = n.states
@ -599,9 +601,9 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
else:
# Compute hidden-to-output
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
1.0, <const float *>A.hiddens, n.hiddens,
<const float *>W.hidden_weights, n.hiddens,
0.0, scores, n.classes)
1.0, <const float *>A.hiddens, n.hiddens,
<const float *>W.hidden_weights, n.hiddens,
0.0, scores, n.classes)
# Add bias
for i in range(n.states):
saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
scores[i*n.classes+j] = min_
cdef void _sum_state_features(CBlas cblas, float* output,
const float* cached, const int* token_ids, SizesC n) nogil:
cdef int idx, b, f, i
cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
const int* token_ids, SizesC n) nogil:
cdef int idx, b, f
cdef const float* feature
cdef int B = n.states
cdef int O = n.hiddens * n.pieces
cdef int O = n.hiddens * n.pieces # no-cython-lint
cdef int F = n.feats
cdef int T = n.tokens
padding = cached + (T * F * O)
@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
feature = &cached[idx]
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
token_ids += F

View File

@ -80,15 +80,13 @@ cdef class Morphology:
out.sort(key=lambda x: x[0])
return dict(out)
def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
norm_feats_string = self.FEATURE_SEP.join([
self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
for field, values in feats.items()
])
])
return norm_feats_string or self.EMPTY_MORPH
cdef hash_t _add(self, features):
"""Insert a morphological analysis in the morphology table, if not
already present. The morphological analysis may be provided in the UD

View File

@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
ADV = symbols.ADV
AUX = symbols.AUX
CONJ = symbols.CONJ
CCONJ = symbols.CCONJ # U20
CCONJ = symbols.CCONJ # U20
DET = symbols.DET
INTJ = symbols.INTJ
NOUN = symbols.NOUN

View File

@ -1,5 +1,4 @@
from cymem.cymem cimport Pool
from libc.stdint cimport int32_t
from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector

View File

@ -57,7 +57,6 @@ cdef class Beam:
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
void* extra_args) except -1
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
self.scores[i][j] = score

View File

@ -1,11 +1,8 @@
# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
cimport cython
from libc.math cimport exp, log
from libc.string cimport memcpy, memset
import math
from cymem.cymem cimport Pool
from libc.math cimport exp
from libc.string cimport memcpy, memset
from preshed.maps cimport PreshMap
@ -70,7 +67,7 @@ cdef class Beam:
self.costs[i][j] = costs[j]
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
cdef int i, j
cdef int i
for i in range(self.width):
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@ -176,7 +173,6 @@ cdef class Beam:
beam-width, and n is the number of classes.
"""
cdef Entry entry
cdef weight_t score
cdef _State* s
cdef int i, j, move_id
assert self.size >= 1
@ -269,7 +265,7 @@ cdef class MaxViolation:
# This can happen from non-monotonic actions
# If we find a better gold analysis this way, be sure to keep it.
elif pred._states[i].loss <= 0 \
and tuple(pred.histories[i]) not in seen_golds:
and tuple(pred.histories[i]) not in seen_golds:
g_scores.append(pred._states[i].score)
g_hist.append(list(pred.histories[i]))
for i in range(gold.size):

View File

@ -60,7 +60,7 @@ cdef class TransitionSystem:
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
int batch_size) nogil
int batch_size) nogil
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil
int nr_class, int batch_size) nogil

View File

@ -291,19 +291,19 @@ cdef class TransitionSystem:
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
int batch_size) nogil:
cdef int i
cdef Transition action
cdef StateC* state
for i in range(batch_size):
state = states[i]
action = moves.c[actions[i]]
action.do(state, action.label)
state.history.push_back(action.clas)
int batch_size) nogil:
cdef int i
cdef Transition action
cdef StateC* state
for i in range(batch_size):
state = states[i]
action = moves.c[actions[i]]
action.do(state, action.label)
state.history.push_back(action.clas)
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
int nr_class, int batch_size) nogil:
is_valid = <int*>calloc(moves.n_moves, sizeof(int))
cdef int i, guess
cdef Transition action
@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
action.do(states[i], action.label)
states[i].history.push_back(guess)
free(is_valid)

View File

@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True, binding=True
from itertools import islice
from typing import Callable, Dict, Iterable, List, Optional, Union
from typing import Callable, Dict, Iterable, Optional, Union
import srsly
from thinc.api import Config, Model
from thinc.legacy import LegacySequenceCategoricalCrossentropy

View File

@ -1,12 +1,11 @@
# cython: infer_types=True, profile=True, binding=True
import warnings
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
import srsly
from ..tokens.doc cimport Doc
from ..errors import Errors, Warnings
from ..errors import Errors
from ..language import Language
from ..training import Example
from ..util import raise_error

View File

@ -1,5 +1,4 @@
# cython: infer_types=True, profile=True, binding=True
import warnings
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
import srsly
@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
from ..tokens.doc cimport Doc
from .. import util
from ..errors import Errors, Warnings
from ..errors import Errors
from ..language import Language
from ..training import Example, validate_distillation_examples, validate_examples
from ..vocab import Vocab
@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
except Exception as e:
error_handler(self.name, self, [doc], e)
def distill(self,
teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"],
*,
drop: float=0.0,
sgd: Optional[Optimizer]=None,
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None
) -> Dict[str, float]:
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is typically trained on the probability
distribution of the teacher, but details may differ per pipe.

View File

@ -222,12 +222,13 @@ class Parser(TrainablePipe):
raise NotImplementedError
def distill(self,
teacher_pipe: Optional[TrainablePipe],
examples: Iterable["Example"],
*,
drop: float=0.0,
sgd: Optional[Optimizer]=None,
losses: Optional[Dict[str, float]]=None):
teacher_pipe: Optional[TrainablePipe],
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None
):
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is trained on the transition probabilities
of the teacher.
@ -277,11 +278,13 @@ class Parser(TrainablePipe):
# teacher's distributions.
student_inputs = TransitionModelInputs(docs=student_docs,
states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
states=[state.copy() for state in states],
moves=self.moves,
max_moves=max_moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = _states_diff_to_actions(states, student_states)
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
states=states, moves=teacher_pipe.moves, actions=actions)
states=states, moves=teacher_pipe.moves, actions=actions)
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@ -294,10 +297,9 @@ class Parser(TrainablePipe):
return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
normalize: bool=False,
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
normalize: bool = False,
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
@ -320,9 +322,9 @@ class Parser(TrainablePipe):
# ourselves.
teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
axis=-1, inplace=True)
axis=-1, inplace=True)
student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
axis=-1, inplace=True)
axis=-1, inplace=True)
assert teacher_scores.shape == student_scores.shape
@ -436,13 +438,15 @@ class Parser(TrainablePipe):
else:
init_states, gold_states, _ = self.moves.init_gold_batch(examples)
inputs = TransitionModelInputs(docs=docs, moves=self.moves,
max_moves=max_moves, states=[state.copy() for state in init_states])
inputs = TransitionModelInputs(docs=docs,
moves=self.moves,
max_moves=max_moves,
states=[state.copy() for state in init_states])
(pred_states, scores), backprop_scores = self.model.begin_update(inputs)
if sum(s.shape[0] for s in scores) == 0:
return losses
d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
examples, max_moves)
examples, max_moves)
backprop_scores((pred_states, d_scores))
if sgd not in (None, False):
self.finish_update(sgd)
@ -483,9 +487,7 @@ class Parser(TrainablePipe):
cdef TransitionSystem moves = self.moves
cdef StateClass state
cdef int clas
cdef int nF = self.model.get_dim("nF")
cdef int nO = moves.n_moves
cdef int nS = sum([len(history) for history in histories])
cdef Pool mem = Pool()
cdef np.ndarray costs_i
is_valid = <int*>mem.alloc(nO, sizeof(int))
@ -552,8 +554,8 @@ class Parser(TrainablePipe):
return losses
def update_beam(self, examples, *, beam_width,
drop=0., sgd=None, losses=None, beam_density=0.0):
def update_beam(self, examples, *, beam_width, drop=0.,
sgd=None, losses=None, beam_density=0.0):
raise NotImplementedError
def set_output(self, nO):
@ -678,9 +680,10 @@ class Parser(TrainablePipe):
return states
# Parse the states that are too long with the teacher's parsing model.
teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
states=[state.copy() for state in to_cut])
(teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
teacher_inputs = TransitionModelInputs(docs=docs,
moves=moves,
states=[state.copy() for state in to_cut])
(teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
# Step through the teacher's actions and store every state after
# each multiple of max_length.
@ -778,6 +781,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
return actions
def _states_diff_to_actions(
before_states: List[StateClass],
after_states: List[StateClass]
@ -798,8 +802,9 @@ def _states_diff_to_actions(
c_state_before = before_state.c
c_state_after = after_state.c
assert equal(c_state_before.history.begin(), c_state_before.history.end(),
c_state_after.history.begin())
assert equal(c_state_before.history.begin(),
c_state_before.history.end(),
c_state_after.history.begin())
actions = []
while True:

View File

@ -1,7 +1,6 @@
# cython: infer_types=True
from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
from typing import Iterable, Iterator, List, Optional, Tuple, Union
cimport cython
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64
@ -243,7 +242,6 @@ cdef class StringStore:
cdef int n_length_bytes
cdef int i
cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length)
@ -301,7 +299,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
try:
return hash_string(string_or_hash)
except:
except: # no-cython-lint
if _try_coerce_to_hash(string_or_hash, &str_hash):
# Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types
@ -318,6 +316,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
try:
out_hash[0] = key
return True
except:
except: # no-cython-lint
return False

View File

@ -2,7 +2,7 @@
from cymem.cymem cimport Pool
from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
from spacy.typedefs cimport class_t, weight_t
from spacy.typedefs cimport class_t
import pytest
@ -42,32 +42,35 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
state = <TestState*>state
mem.free(state)
@cytest
@pytest.mark.parametrize("nr_class,beam_width",
[
(2, 3),
(3, 6),
(4, 20),
]
)
[
(2, 3),
(3, 6),
(4, 20),
]
)
def test_init(nr_class, beam_width):
b = Beam(nr_class, beam_width)
assert b.size == 1
assert b.width == beam_width
assert b.nr_class == nr_class
@cytest
def test_init_violn():
MaxViolation()
@cytest
@pytest.mark.parametrize("nr_class,beam_width,length",
[
(2, 3, 3),
(3, 6, 15),
(4, 20, 32),
]
)
[
(2, 3, 3),
(3, 6, 15),
(4, 20, 32),
]
)
def test_initialize(nr_class, beam_width, length):
b = Beam(nr_class, beam_width)
b.initialize(initialize, destroy, length, NULL)
@ -79,11 +82,11 @@ def test_initialize(nr_class, beam_width, length):
@cytest
@pytest.mark.parametrize("nr_class,beam_width,length,extra",
[
(2, 3, 4, None),
(3, 6, 15, u"test beam 1"),
]
)
[
(2, 3, 4, None),
(3, 6, 15, u"test beam 1"),
]
)
def test_initialize_extra(nr_class, beam_width, length, extra):
b = Beam(nr_class, beam_width)
if extra is None:
@ -97,11 +100,11 @@ def test_initialize_extra(nr_class, beam_width, length, extra):
@cytest
@pytest.mark.parametrize("nr_class,beam_width,length",
[
(3, 6, 15),
(4, 20, 32),
]
)
[
(3, 6, 15),
(4, 20, 32),
]
)
def test_transition(nr_class, beam_width, length):
b = Beam(nr_class, beam_width)
b.initialize(initialize, destroy, length, NULL)

View File

@ -1759,7 +1759,7 @@ cdef class Doc:
data["underscore_span"] = {}
if attr not in data["underscore_span"]:
data["underscore_span"][attr] = []
data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id": _span_id})
for attr in underscore:
if attr not in user_keys:

View File

@ -1,5 +1,4 @@
cimport numpy as np
from libc.string cimport memset
from ..errors import Errors
from ..morphology import Morphology

View File

@ -225,8 +225,8 @@ cdef class Span:
@property
def _(self):
cdef SpanC* span_c = self.span_c()
"""Custom extension attributes registered via `set_extension`."""
cdef SpanC* span_c = self.span_c()
return Underscore(Underscore.span_extensions, self,
start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
@ -933,7 +933,6 @@ cdef class Span:
self.id_ = ent_id_
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
# Don't allow spaces to be the root, if there are
# better candidates