cython fixes and cleanup

2025-11-06 02:47:29 +03:00 · 2023-07-19 17:41:29 +02:00 · 2023-07-19 17:41:29 +02:00 · 96f2e30c4b
commit 96f2e30c4b
parent 846472129c
18 changed files with 118 additions and 128 deletions
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -158,7 +158,6 @@ cdef class PhraseMatcher:
        del self._callbacks[key]
        del self._docs[key]

-
    def _add_from_arrays(self, key, specs, *, on_match=None):
        """Add a preprocessed list of specs, with an optional callback.

@@ -194,7 +193,6 @@ cdef class PhraseMatcher:
                result = internal_node
            map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)

-
    def add(self, key, docs, *, on_match=None):
        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
        key, a list of one or more patterns, and (optionally) an on_match callback.
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, TypeVar, cast
+from typing import Any, List, Optional, Tuple, cast

 from libc.stdlib cimport calloc, free, realloc
 from libc.string cimport memcpy, memset
@@ -23,7 +23,7 @@ from thinc.api import (

 from thinc.backends.cblas cimport CBlas, saxpy, sgemm

-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
+from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d

 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
@@ -136,7 +136,7 @@ def init(
    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
 ):
    if X is not None:
-        docs, moves = X
+        docs, _ = X
        model.get_ref("tok2vec").initialize(X=docs)
    else:
        model.get_ref("tok2vec").initialize()
@@ -145,7 +145,7 @@ def init(
        current_nO = model.maybe_get_dim("nO")
        if current_nO is None or current_nO != inferred_nO:
            model.attrs["resize_output"](model, inferred_nO)
-    nO = model.get_dim("nO")
+    # nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nH = model.get_dim("nH")
    nI = model.get_dim("nI")
@@ -194,7 +194,8 @@ class TransitionModelInputs:
        moves: TransitionSystem,
        actions: Optional[List[Ints1d]] = None,
        max_moves: int = 0,
-        states: Optional[List[State]]=None):
+        states: Optional[List[State]] = None,
+    ):
        """
        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
        docs (List[Doc]): Docs to predict transition sequences for.
@@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State

    return (states, scores), backprop

+
 cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i, j
+    cdef int i
    cdef vector[StateC *] unfinished
    cdef ActivationsC activations = _alloc_activations(sizes)
    cdef np.ndarray step_scores
@@ -371,7 +373,7 @@ def _forward_fallback(
            for clas in set(model.attrs["unseen_classes"]):
                if (d_scores[:, clas] < 0).any():
                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
+        d_scores *= seen_mask == False  # no-cython-lint
        # Calculate the gradients for the parameters of the output layer.
        # The weight gemm is (nS, nO) @ (nS, nH).T
        output.inc_grad("b", d_scores.sum(axis=0))
@@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
                scores[i*n.classes+j] = min_


-cdef void _sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f, i
+cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
+                              const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f
    cdef const float* feature
    cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces
+    cdef int O = n.hiddens * n.pieces  # no-cython-lint
    cdef int F = n.feats
    cdef int T = n.tokens
    padding = cached + (T * F * O)
@@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
                feature = &cached[idx]
            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
        token_ids += F
-
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -80,7 +80,6 @@ cdef class Morphology:
        out.sort(key=lambda x: x[0])
        return dict(out)

-
    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
        norm_feats_string = self.FEATURE_SEP.join([
            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
@@ -88,7 +87,6 @@ cdef class Morphology:
            ])
        return norm_feats_string or self.EMPTY_MORPH

-
    cdef hash_t _add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present. The morphological analysis may be provided in the UD
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,5 +1,4 @@
 from cymem.cymem cimport Pool
-from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector

--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -58,7 +58,6 @@ cdef class Beam:
                     void* extra_args) except -1
    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1

-
    cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
        self.scores[i][j] = score
        self.is_valid[i][j] = is_valid
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,11 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.math cimport exp, log
-from libc.string cimport memcpy, memset
-
-import math
-
 from cymem.cymem cimport Pool
+from libc.math cimport exp
+from libc.string cimport memcpy, memset
 from preshed.maps cimport PreshMap


@@ -70,7 +67,7 @@ cdef class Beam:
            self.costs[i][j] = costs[j]

    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
-        cdef int i, j
+        cdef int i
        for i in range(self.width):
            memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
            memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@@ -176,7 +173,6 @@ cdef class Beam:
        beam-width, and n is the number of classes.
        """
        cdef Entry entry
-        cdef weight_t score
        cdef _State* s
        cdef int i, j, move_id
        assert self.size >= 1
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
            action.do(states[i], action.label)
            states[i].history.push_back(guess)
    free(is_valid)
-
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,8 +1,7 @@
 # cython: infer_types=True, profile=True, binding=True
 from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Dict, Iterable, Optional, Union

-import srsly
 from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy

--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,12 +1,11 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
 from typing import Callable, Dict, Iterable, Iterator, Tuple, Union

 import srsly

 from ..tokens.doc cimport Doc

-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example
 from ..util import raise_error
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,5 +1,4 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple

 import srsly
@@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
 from ..tokens.doc cimport Doc

 from .. import util
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example, validate_distillation_examples, validate_examples
 from ..vocab import Vocab
@@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
        except Exception as e:
            error_handler(self.name, self, [doc], e)

-
    def distill(self,
                teacher_pipe: Optional["TrainablePipe"],
                examples: Iterable["Example"],
                *,
                drop: float = 0.0,
                sgd: Optional[Optimizer] = None,
-               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+                losses: Optional[Dict[str, float]] = None
+                ) -> Dict[str, float]:
        """Train a pipe (the student) on the predictions of another pipe
        (the teacher). The student is typically trained on the probability
        distribution of the teacher, but details may differ per pipe.
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -227,7 +227,8 @@ class Parser(TrainablePipe):
                *,
                drop: float = 0.0,
                sgd: Optional[Optimizer] = None,
-               losses: Optional[Dict[str, float]]=None):
+                losses: Optional[Dict[str, float]] = None
+                ):
        """Train a pipe (the student) on the predictions of another pipe
        (the teacher). The student is trained on the transition probabilities
        of the teacher.
@@ -277,7 +278,9 @@ class Parser(TrainablePipe):
        # teacher's distributions.

        student_inputs = TransitionModelInputs(docs=student_docs,
-            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
+                                               states=[state.copy() for state in states],
+                                               moves=self.moves,
+                                               max_moves=max_moves)
        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
        actions = _states_diff_to_actions(states, student_states)
        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
@@ -294,7 +297,6 @@ class Parser(TrainablePipe):

        return losses

-
    def get_teacher_student_loss(
            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
            normalize: bool = False,
@@ -436,8 +438,10 @@ class Parser(TrainablePipe):
        else:
            init_states, gold_states, _ = self.moves.init_gold_batch(examples)

-        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
-            max_moves=max_moves, states=[state.copy() for state in init_states])
+        inputs = TransitionModelInputs(docs=docs,
+                                       moves=self.moves,
+                                       max_moves=max_moves,
+                                       states=[state.copy() for state in init_states])
        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
        if sum(s.shape[0] for s in scores) == 0:
            return losses
@@ -483,9 +487,7 @@ class Parser(TrainablePipe):
        cdef TransitionSystem moves = self.moves
        cdef StateClass state
        cdef int clas
-        cdef int nF = self.model.get_dim("nF")
        cdef int nO = moves.n_moves
-        cdef int nS = sum([len(history) for history in histories])
        cdef Pool mem = Pool()
        cdef np.ndarray costs_i
        is_valid = <int*>mem.alloc(nO, sizeof(int))
@@ -552,8 +554,8 @@ class Parser(TrainablePipe):

        return losses

-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0.,
+                    sgd=None, losses=None, beam_density=0.0):
        raise NotImplementedError

    def set_output(self, nO):
@@ -678,7 +680,8 @@ class Parser(TrainablePipe):
            return states

        # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
+        teacher_inputs = TransitionModelInputs(docs=docs,
+                                               moves=moves,
                                               states=[state.copy() for state in to_cut])
        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)

@@ -778,6 +781,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:

    return actions

+
 def _states_diff_to_actions(
    before_states: List[StateClass],
    after_states: List[StateClass]
@@ -798,7 +802,8 @@ def _states_diff_to_actions(
        c_state_before = before_state.c
        c_state_after = after_state.c

-        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
+        assert equal(c_state_before.history.begin(),
+                     c_state_before.history.end(),
                     c_state_after.history.begin())

    actions = []
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,6 @@
 # cython: infer_types=True
-from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Iterable, Iterator, List, Optional, Tuple, Union

-cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash64
@@ -243,7 +242,6 @@ cdef class StringStore:
        cdef int n_length_bytes
        cdef int i
        cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
-        cdef uint32_t ulength = length
        if length < sizeof(string.s):
            string.s[0] = <unsigned char>length
            memcpy(&string.s[1], chars, length)
@@ -301,7 +299,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:

    try:
        return hash_string(string_or_hash)
-    except:
+    except:   # no-cython-lint
        if _try_coerce_to_hash(string_or_hash, &str_hash):
            # Coerce the integral key to the expected primitive hash type.
            # This ensures that custom/overloaded "primitive" data types
@@ -318,6 +316,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
    try:
        out_hash[0] = key
        return True
-    except:
+    except:  # no-cython-lint
        return False
-
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -2,7 +2,7 @@
 from cymem.cymem cimport Pool

 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
-from spacy.typedefs cimport class_t, weight_t
+from spacy.typedefs cimport class_t

 import pytest

@@ -42,6 +42,7 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
    state = <TestState*>state
    mem.free(state)

+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width",
                         [
@@ -56,10 +57,12 @@ def test_init(nr_class, beam_width):
    assert b.width == beam_width
    assert b.nr_class == nr_class

+
 @cytest
 def test_init_violn():
    MaxViolation()

+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
                         [
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,5 +1,4 @@
 cimport numpy as np
-from libc.string cimport memset

 from ..errors import Errors
 from ..morphology import Morphology
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -225,8 +225,8 @@ cdef class Span:

    @property
    def _(self):
-        cdef SpanC* span_c = self.span_c()
        """Custom extension attributes registered via `set_extension`."""
+        cdef SpanC* span_c = self.span_c()
        return Underscore(Underscore.span_extensions, self,
                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)

@@ -933,7 +933,6 @@ cdef class Span:
            self.id_ = ent_id_


-
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
    # Don't allow spaces to be the root, if there are
    # better candidates