Cython fixes and cleanup

This commit is contained in:
svlandeg 2023-07-19 17:41:29 +02:00
parent 846472129c
commit 96f2e30c4b
18 changed files with 118 additions and 128 deletions

View File

@ -158,7 +158,6 @@ cdef class PhraseMatcher:
del self._callbacks[key]
del self._docs[key]
def _add_from_arrays(self, key, specs, *, on_match=None):
"""Add a preprocessed list of specs, with an optional callback.
@ -194,7 +193,6 @@ cdef class PhraseMatcher:
result = internal_node
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
def add(self, key, docs, *, on_match=None):
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
key, a list of one or more patterns, and (optionally) an on_match callback.

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
from typing import Any, List, Optional, Tuple, TypeVar, cast
from typing import Any, List, Optional, Tuple, cast
from libc.stdlib cimport calloc, free, realloc
from libc.string cimport memcpy, memset
@ -23,7 +23,7 @@ from thinc.api import (
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
from ..errors import Errors
from ..pipeline._parser_internals import _beam_utils
@ -136,7 +136,7 @@ def init(
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
):
if X is not None:
docs, moves = X
docs, _ = X
model.get_ref("tok2vec").initialize(X=docs)
else:
model.get_ref("tok2vec").initialize()
@ -145,7 +145,7 @@ def init(
current_nO = model.maybe_get_dim("nO")
if current_nO is None or current_nO != inferred_nO:
model.attrs["resize_output"](model, inferred_nO)
nO = model.get_dim("nO")
# nO = model.get_dim("nO")
nP = model.get_dim("nP")
nH = model.get_dim("nH")
nI = model.get_dim("nI")
@ -194,7 +194,8 @@ class TransitionModelInputs:
moves: TransitionSystem,
actions: Optional[List[Ints1d]] = None,
max_moves: int = 0,
states: Optional[List[State]]=None):
states: Optional[List[State]] = None,
):
"""
actions (Optional[List[Ints1d]]): actions to apply for each Doc.
docs (List[Doc]): Docs to predict transition sequences for.
@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
return (states, scores), backprop
cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
cdef int i, j
cdef int i
cdef vector[StateC *] unfinished
cdef ActivationsC activations = _alloc_activations(sizes)
cdef np.ndarray step_scores
@ -371,7 +373,7 @@ def _forward_fallback(
for clas in set(model.attrs["unseen_classes"]):
if (d_scores[:, clas] < 0).any():
model.attrs["unseen_classes"].remove(clas)
d_scores *= seen_mask == False
d_scores *= seen_mask == False # no-cython-lint
# Calculate the gradients for the parameters of the output layer.
# The weight gemm is (nS, nO) @ (nS, nH).T
output.inc_grad("b", d_scores.sum(axis=0))
@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
scores[i*n.classes+j] = min_
cdef void _sum_state_features(CBlas cblas, float* output,
const float* cached, const int* token_ids, SizesC n) nogil:
cdef int idx, b, f, i
cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
const int* token_ids, SizesC n) nogil:
cdef int idx, b, f
cdef const float* feature
cdef int B = n.states
cdef int O = n.hiddens * n.pieces
cdef int O = n.hiddens * n.pieces # no-cython-lint
cdef int F = n.feats
cdef int T = n.tokens
padding = cached + (T * F * O)
@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
feature = &cached[idx]
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
token_ids += F

View File

@ -80,7 +80,6 @@ cdef class Morphology:
out.sort(key=lambda x: x[0])
return dict(out)
def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
norm_feats_string = self.FEATURE_SEP.join([
self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
@ -88,7 +87,6 @@ cdef class Morphology:
])
return norm_feats_string or self.EMPTY_MORPH
cdef hash_t _add(self, features):
"""Insert a morphological analysis in the morphology table, if not
already present. The morphological analysis may be provided in the UD

View File

@ -1,5 +1,4 @@
from cymem.cymem cimport Pool
from libc.stdint cimport int32_t
from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector

View File

@ -58,7 +58,6 @@ cdef class Beam:
void* extra_args) except -1
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
self.scores[i][j] = score
self.is_valid[i][j] = is_valid

View File

@ -1,11 +1,8 @@
# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
cimport cython
from libc.math cimport exp, log
from libc.string cimport memcpy, memset
import math
from cymem.cymem cimport Pool
from libc.math cimport exp
from libc.string cimport memcpy, memset
from preshed.maps cimport PreshMap
@ -70,7 +67,7 @@ cdef class Beam:
self.costs[i][j] = costs[j]
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
cdef int i, j
cdef int i
for i in range(self.width):
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@ -176,7 +173,6 @@ cdef class Beam:
beam-width, and n is the number of classes.
"""
cdef Entry entry
cdef weight_t score
cdef _State* s
cdef int i, j, move_id
assert self.size >= 1

View File

@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
action.do(states[i], action.label)
states[i].history.push_back(guess)
free(is_valid)

View File

@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True, binding=True
from itertools import islice
from typing import Callable, Dict, Iterable, List, Optional, Union
from typing import Callable, Dict, Iterable, Optional, Union
import srsly
from thinc.api import Config, Model
from thinc.legacy import LegacySequenceCategoricalCrossentropy

View File

@ -1,12 +1,11 @@
# cython: infer_types=True, profile=True, binding=True
import warnings
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
import srsly
from ..tokens.doc cimport Doc
from ..errors import Errors, Warnings
from ..errors import Errors
from ..language import Language
from ..training import Example
from ..util import raise_error

View File

@ -1,5 +1,4 @@
# cython: infer_types=True, profile=True, binding=True
import warnings
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
import srsly
@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
from ..tokens.doc cimport Doc
from .. import util
from ..errors import Errors, Warnings
from ..errors import Errors
from ..language import Language
from ..training import Example, validate_distillation_examples, validate_examples
from ..vocab import Vocab
@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
except Exception as e:
error_handler(self.name, self, [doc], e)
def distill(self,
teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
losses: Optional[Dict[str, float]] = None
) -> Dict[str, float]:
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is typically trained on the probability
distribution of the teacher, but details may differ per pipe.

View File

@ -227,7 +227,8 @@ class Parser(TrainablePipe):
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]]=None):
losses: Optional[Dict[str, float]] = None
):
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is trained on the transition probabilities
of the teacher.
@ -277,7 +278,9 @@ class Parser(TrainablePipe):
# teacher's distributions.
student_inputs = TransitionModelInputs(docs=student_docs,
states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
states=[state.copy() for state in states],
moves=self.moves,
max_moves=max_moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = _states_diff_to_actions(states, student_states)
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
@ -294,7 +297,6 @@ class Parser(TrainablePipe):
return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
normalize: bool = False,
@ -436,8 +438,10 @@ class Parser(TrainablePipe):
else:
init_states, gold_states, _ = self.moves.init_gold_batch(examples)
inputs = TransitionModelInputs(docs=docs, moves=self.moves,
max_moves=max_moves, states=[state.copy() for state in init_states])
inputs = TransitionModelInputs(docs=docs,
moves=self.moves,
max_moves=max_moves,
states=[state.copy() for state in init_states])
(pred_states, scores), backprop_scores = self.model.begin_update(inputs)
if sum(s.shape[0] for s in scores) == 0:
return losses
@ -483,9 +487,7 @@ class Parser(TrainablePipe):
cdef TransitionSystem moves = self.moves
cdef StateClass state
cdef int clas
cdef int nF = self.model.get_dim("nF")
cdef int nO = moves.n_moves
cdef int nS = sum([len(history) for history in histories])
cdef Pool mem = Pool()
cdef np.ndarray costs_i
is_valid = <int*>mem.alloc(nO, sizeof(int))
@ -552,8 +554,8 @@ class Parser(TrainablePipe):
return losses
def update_beam(self, examples, *, beam_width,
drop=0., sgd=None, losses=None, beam_density=0.0):
def update_beam(self, examples, *, beam_width, drop=0.,
sgd=None, losses=None, beam_density=0.0):
raise NotImplementedError
def set_output(self, nO):
@ -678,7 +680,8 @@ class Parser(TrainablePipe):
return states
# Parse the states that are too long with the teacher's parsing model.
teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
teacher_inputs = TransitionModelInputs(docs=docs,
moves=moves,
states=[state.copy() for state in to_cut])
(teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
@ -778,6 +781,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
return actions
def _states_diff_to_actions(
before_states: List[StateClass],
after_states: List[StateClass]
@ -798,7 +802,8 @@ def _states_diff_to_actions(
c_state_before = before_state.c
c_state_after = after_state.c
assert equal(c_state_before.history.begin(), c_state_before.history.end(),
assert equal(c_state_before.history.begin(),
c_state_before.history.end(),
c_state_after.history.begin())
actions = []

View File

@ -1,7 +1,6 @@
# cython: infer_types=True
from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
from typing import Iterable, Iterator, List, Optional, Tuple, Union
cimport cython
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64
@ -243,7 +242,6 @@ cdef class StringStore:
cdef int n_length_bytes
cdef int i
cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length)
@ -301,7 +299,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
try:
return hash_string(string_or_hash)
except:
except: # no-cython-lint
if _try_coerce_to_hash(string_or_hash, &str_hash):
# Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types
@ -318,6 +316,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
try:
out_hash[0] = key
return True
except:
except: # no-cython-lint
return False

View File

@ -2,7 +2,7 @@
from cymem.cymem cimport Pool
from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
from spacy.typedefs cimport class_t, weight_t
from spacy.typedefs cimport class_t
import pytest
@ -42,6 +42,7 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
state = <TestState*>state
mem.free(state)
@cytest
@pytest.mark.parametrize("nr_class,beam_width",
[
@ -56,10 +57,12 @@ def test_init(nr_class, beam_width):
assert b.width == beam_width
assert b.nr_class == nr_class
@cytest
def test_init_violn():
MaxViolation()
@cytest
@pytest.mark.parametrize("nr_class,beam_width,length",
[

View File

@ -1,5 +1,4 @@
cimport numpy as np
from libc.string cimport memset
from ..errors import Errors
from ..morphology import Morphology

View File

@ -225,8 +225,8 @@ cdef class Span:
@property
def _(self):
cdef SpanC* span_c = self.span_c()
"""Custom extension attributes registered via `set_extension`."""
cdef SpanC* span_c = self.span_c()
return Underscore(Underscore.span_extensions, self,
start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
@ -933,7 +933,6 @@ cdef class Span:
self.id_ = ent_id_
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
# Don't allow spaces to be the root, if there are
# better candidates