Cython fixes and cleanup

2025-09-18 10:02:40 +03:00 · 2023-07-19 17:41:29 +02:00 · 2023-07-19 17:41:29 +02:00 · 96f2e30c4b
commit 96f2e30c4b
parent 846472129c
18 changed files with 118 additions and 128 deletions
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -158,7 +158,6 @@ cdef class PhraseMatcher:
        del self._callbacks[key]
        del self._docs[key]
    def _add_from_arrays(self, key, specs, *, on_match=None):
        """Add a preprocessed list of specs, with an optional callback.
@ -194,7 +193,6 @@ cdef class PhraseMatcher:
                result = internal_node
            map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
    def add(self, key, docs, *, on_match=None):
        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
        key, a list of one or more patterns, and (optionally) an on_match callback.
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@ -1,5 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, TypeVar, cast
+from typing import Any, List, Optional, Tuple, cast
 from libc.stdlib cimport calloc, free, realloc
 from libc.string cimport memcpy, memset
@ -23,7 +23,7 @@ from thinc.api import (
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
+from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
@ -136,7 +136,7 @@ def init(
    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
 ):
    if X is not None:
-        docs, moves = X
+        docs, _ = X
        model.get_ref("tok2vec").initialize(X=docs)
    else:
        model.get_ref("tok2vec").initialize()
@ -145,7 +145,7 @@ def init(
        current_nO = model.maybe_get_dim("nO")
        if current_nO is None or current_nO != inferred_nO:
            model.attrs["resize_output"](model, inferred_nO)
-    nO = model.get_dim("nO")
+    # nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nH = model.get_dim("nH")
    nI = model.get_dim("nI")
@ -194,7 +194,8 @@ class TransitionModelInputs:
        moves: TransitionSystem,
        actions: Optional[List[Ints1d]] = None,
        max_moves: int = 0,
-        states: Optional[List[State]]=None):
+        states: Optional[List[State]] = None,
    ):
        """
        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
        docs (List[Doc]): Docs to predict transition sequences for.
@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
    return (states, scores), backprop
 cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i, j
+    cdef int i
    cdef vector[StateC *] unfinished
    cdef ActivationsC activations = _alloc_activations(sizes)
    cdef np.ndarray step_scores
@ -371,7 +373,7 @@ def _forward_fallback(
            for clas in set(model.attrs["unseen_classes"]):
                if (d_scores[:, clas] < 0).any():
                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
+        d_scores *= seen_mask == False  # no-cython-lint
        # Calculate the gradients for the parameters of the output layer.
        # The weight gemm is (nS, nO) @ (nS, nH).T
        output.inc_grad("b", d_scores.sum(axis=0))
@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
                scores[i*n.classes+j] = min_
-cdef void _sum_state_features(CBlas cblas, float* output,
+cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
-        const float* cached, const int* token_ids, SizesC n) nogil:
+                              const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f, i
+    cdef int idx, b, f
    cdef const float* feature
    cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces
+    cdef int O = n.hiddens * n.pieces  # no-cython-lint
    cdef int F = n.feats
    cdef int T = n.tokens
    padding = cached + (T * F * O)
@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
                feature = &cached[idx]
            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
        token_ids += F
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -80,7 +80,6 @@ cdef class Morphology:
        out.sort(key=lambda x: x[0])
        return dict(out)
    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
        norm_feats_string = self.FEATURE_SEP.join([
            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
@ -88,7 +87,6 @@ cdef class Morphology:
            ])
        return norm_feats_string or self.EMPTY_MORPH
    cdef hash_t _add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present. The morphological analysis may be provided in the UD
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@ -1,5 +1,4 @@
 from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@ -58,7 +58,6 @@ cdef class Beam:
                     void* extra_args) except -1
    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
    cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
        self.scores[i][j] = score
        self.is_valid[i][j] = is_valid
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@ -1,11 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
 from libc.math cimport exp, log
 from libc.string cimport memcpy, memset
 import math
 from cymem.cymem cimport Pool
 from libc.math cimport exp
 from libc.string cimport memcpy, memset
 from preshed.maps cimport PreshMap
@ -70,7 +67,7 @@ cdef class Beam:
            self.costs[i][j] = costs[j]
    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
-        cdef int i, j
+        cdef int i
        for i in range(self.width):
            memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
            memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@ -176,7 +173,6 @@ cdef class Beam:
        beam-width, and n is the number of classes.
        """
        cdef Entry entry
        cdef weight_t score
        cdef _State* s
        cdef int i, j, move_id
        assert self.size >= 1
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
            action.do(states[i], action.label)
            states[i].history.push_back(guess)
    free(is_valid)
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -1,8 +1,7 @@
 # cython: infer_types=True, profile=True, binding=True
 from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Dict, Iterable, Optional, Union
 import srsly
 from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@ -1,12 +1,11 @@
 # cython: infer_types=True, profile=True, binding=True
 import warnings
 from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
 import srsly
 from ..tokens.doc cimport Doc
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example
 from ..util import raise_error
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@ -1,5 +1,4 @@
 # cython: infer_types=True, profile=True, binding=True
 import warnings
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 import srsly
@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
 from ..tokens.doc cimport Doc
 from .. import util
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example, validate_distillation_examples, validate_examples
 from ..vocab import Vocab
@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
        except Exception as e:
            error_handler(self.name, self, [doc], e)
    def distill(self,
                teacher_pipe: Optional["TrainablePipe"],
                examples: Iterable["Example"],
                *,
                drop: float = 0.0,
                sgd: Optional[Optimizer] = None,
-               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+                losses: Optional[Dict[str, float]] = None
                ) -> Dict[str, float]:
        """Train a pipe (the student) on the predictions of another pipe
        (the teacher). The student is typically trained on the probability
        distribution of the teacher, but details may differ per pipe.
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@ -227,7 +227,8 @@ class Parser(TrainablePipe):
                *,
                drop: float = 0.0,
                sgd: Optional[Optimizer] = None,
-               losses: Optional[Dict[str, float]]=None):
+                losses: Optional[Dict[str, float]] = None
                ):
        """Train a pipe (the student) on the predictions of another pipe
        (the teacher). The student is trained on the transition probabilities
        of the teacher.
@ -277,7 +278,9 @@ class Parser(TrainablePipe):
        # teacher's distributions.
        student_inputs = TransitionModelInputs(docs=student_docs,
-            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
+                                               states=[state.copy() for state in states],
                                               moves=self.moves,
                                               max_moves=max_moves)
        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
        actions = _states_diff_to_actions(states, student_states)
        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
@ -294,7 +297,6 @@ class Parser(TrainablePipe):
        return losses
    def get_teacher_student_loss(
            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
            normalize: bool = False,
@ -436,8 +438,10 @@ class Parser(TrainablePipe):
        else:
            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
-        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
+        inputs = TransitionModelInputs(docs=docs,
-            max_moves=max_moves, states=[state.copy() for state in init_states])
+                                       moves=self.moves,
                                       max_moves=max_moves,
                                       states=[state.copy() for state in init_states])
        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
        if sum(s.shape[0] for s in scores) == 0:
            return losses
@ -483,9 +487,7 @@ class Parser(TrainablePipe):
        cdef TransitionSystem moves = self.moves
        cdef StateClass state
        cdef int clas
        cdef int nF = self.model.get_dim("nF")
        cdef int nO = moves.n_moves
        cdef int nS = sum([len(history) for history in histories])
        cdef Pool mem = Pool()
        cdef np.ndarray costs_i
        is_valid = <int*>mem.alloc(nO, sizeof(int))
@ -552,8 +554,8 @@ class Parser(TrainablePipe):
        return losses
-    def update_beam(self, examples, *, beam_width,
+    def update_beam(self, examples, *, beam_width, drop=0.,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+                    sgd=None, losses=None, beam_density=0.0):
        raise NotImplementedError
    def set_output(self, nO):
@ -678,7 +680,8 @@ class Parser(TrainablePipe):
            return states
        # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
+        teacher_inputs = TransitionModelInputs(docs=docs,
                                               moves=moves,
                                               states=[state.copy() for state in to_cut])
        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
@ -778,6 +781,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
    return actions
 def _states_diff_to_actions(
    before_states: List[StateClass],
    after_states: List[StateClass]
@ -798,7 +802,8 @@ def _states_diff_to_actions(
        c_state_before = before_state.c
        c_state_after = after_state.c
-        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
+        assert equal(c_state_before.history.begin(),
                     c_state_before.history.end(),
                     c_state_after.history.begin())
    actions = []
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -1,7 +1,6 @@
 # cython: infer_types=True
-from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Iterable, Iterator, List, Optional, Tuple, Union
 cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash64
@ -243,7 +242,6 @@ cdef class StringStore:
        cdef int n_length_bytes
        cdef int i
        cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
        cdef uint32_t ulength = length
        if length < sizeof(string.s):
            string.s[0] = <unsigned char>length
            memcpy(&string.s[1], chars, length)
@ -301,7 +299,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
    try:
        return hash_string(string_or_hash)
-    except:
+    except:   # no-cython-lint
        if _try_coerce_to_hash(string_or_hash, &str_hash):
            # Coerce the integral key to the expected primitive hash type.
            # This ensures that custom/overloaded "primitive" data types
@ -318,6 +316,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
    try:
        out_hash[0] = key
        return True
-    except:
+    except:  # no-cython-lint
        return False
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@ -2,7 +2,7 @@
 from cymem.cymem cimport Pool
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
-from spacy.typedefs cimport class_t, weight_t
+from spacy.typedefs cimport class_t
 import pytest
@ -42,6 +42,7 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
    state = <TestState*>state
    mem.free(state)
@cytest
@pytest.mark.parametrize("nr_class,beam_width",
                         [
@ -56,10 +57,12 @@ def test_init(nr_class, beam_width):
    assert b.width == beam_width
    assert b.nr_class == nr_class
@cytest
 def test_init_violn():
    MaxViolation()
@cytest
@pytest.mark.parametrize("nr_class,beam_width,length",
                         [
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@ -1,5 +1,4 @@
 cimport numpy as np
 from libc.string cimport memset
 from ..errors import Errors
 from ..morphology import Morphology
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -225,8 +225,8 @@ cdef class Span:
    @property
    def _(self):
        cdef SpanC* span_c = self.span_c()
        """Custom extension attributes registered via `set_extension`."""
        cdef SpanC* span_c = self.span_c()
        return Underscore(Underscore.span_extensions, self,
                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
@ -933,7 +933,6 @@ cdef class Span:
            self.id_ = ent_id_
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
    # Don't allow spaces to be the root, if there are
    # better candidates