Fix tokens/.

Raphael Mitsch 2023-07-03 15:17:54 +02:00
parent 1ac29fd8df
commit 9f62a49ebb
10 changed files with 65 additions and 80 deletions

View File

@@ -47,7 +47,7 @@ jobs:
 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
 - name: cython-lint
 run: |
-python -m pip install cython-lint -c requirements.txt --ignore E501,W291
+python -m pip install cython-lint -c requirements.txt --ignore E501,W291,E266
 cython-lint spacy
 tests:

View File

@@ -1,7 +1,6 @@
 # cython: infer_types=True, bounds_check=False, profile=True
 from cymem.cymem cimport Pool
-from libc.stdlib cimport free, malloc
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
 import numpy
 from thinc.api import get_array_module
@@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
 from ..structs cimport LexemeC, TokenC
 from ..vocab cimport Vocab
-from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
+from .doc cimport Doc, set_children_from_heads, token_by_start
 from .span cimport Span
 from .token cimport Token
@@ -147,7 +146,7 @@ def _merge(Doc doc, merges):
 syntactic root of the span.
 RETURNS (Token): The first newly merged token.
 """
-cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
+cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
 cdef Span span
 cdef const LexemeC* lex
 cdef TokenC* token
@@ -165,7 +164,6 @@ def _merge(Doc doc, merges):
 merges.sort(key=_get_start)
 for merge_index, (span, attributes) in enumerate(merges):
 start = span.start
-end = span.end
 spans.append(span)
 # House the new merged token where it starts
 token = &doc.c[start]
@@ -203,8 +201,9 @@ def _merge(Doc doc, merges):
 # for the merged region. To do this, we create a boolean array indicating
 # whether the row is to be deleted, then use numpy.delete
 if doc.tensor is not None and doc.tensor.size != 0:
-doc.tensor = _resize_tensor(doc.tensor,
-[(m[0].start, m[0].end) for m in merges])
+doc.tensor = _resize_tensor(
+doc.tensor, [(m[0].start, m[0].end) for m in merges]
+)
 # Memorize span roots and sets dependencies of the newly merged
 # tokens to the dependencies of their roots.
 span_roots = []
@@ -267,11 +266,11 @@ def _merge(Doc doc, merges):
 span_index += 1
 if span_index < len(spans) and i == spans[span_index].start:
 # First token in a span
-doc.c[i - offset] = doc.c[i] # move token to its place
+doc.c[i - offset] = doc.c[i]  # move token to its place
 offset += (spans[span_index].end - spans[span_index].start) - 1
 in_span = True
 if not in_span:
-doc.c[i - offset] = doc.c[i] # move token to its place
+doc.c[i - offset] = doc.c[i]  # move token to its place
 for i in range(doc.length - offset, doc.length):
 memset(&doc.c[i], 0, sizeof(TokenC))
@@ -345,7 +344,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
 if to_process_tensor:
 xp = get_array_module(doc.tensor)
 if xp is numpy:
-doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
+doc.tensor = xp.append(
+doc.tensor,
+xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
+axis=0
+)
 else:
 shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
 resized_array = xp.zeros(shape, dtype="float32")
@@ -367,7 +370,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
 token.norm = 0 # reset norm
 if to_process_tensor:
 # setting the tensors of the split tokens to array of zeros
-doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
+doc.tensor[token_index + i:token_index + i + 1] = \
+xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
 # Update the character offset of the subtokens
 if i != 0:
 token.idx = orig_token.idx + idx_offset
@@ -455,7 +459,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
 def set_token_attrs(Token py_token, attrs):
 cdef TokenC* token = py_token.c
 cdef const LexemeC* lex = token.lex
-cdef Doc doc = py_token.doc
 # Assign attributes
 for attr_name, attr_value in attrs.items():
 if attr_name == "_": # Set extension attributes

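For context, the merge/split internals changed above are normally exercised through the public retokenizer API rather than called directly; a minimal sketch, assuming a blank English pipeline and an illustrative sentence:

>>> import spacy
>>> nlp = spacy.blank("en")
>>> doc = nlp("I like New York in autumn")
>>> with doc.retokenize() as retokenizer:
...     # merge the two-token span into one token; attrs apply to the merged token
...     retokenizer.merge(doc[2:4], attrs={"LEMMA": "New York"})
>>> [t.text for t in doc]
['I', 'like', 'New York', 'in', 'autumn']
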
View File

@@ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
 cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
-cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
+cdef int [:, :] _get_lca_matrix(Doc, int start, int end)
 cdef class Doc:
@@ -61,7 +61,6 @@ cdef class Doc:
 cdef int length
 cdef int max_length
 cdef public object noun_chunks_iterator
 cdef object __weakref__

View File

@@ -43,14 +43,13 @@ from ..attrs cimport (
 attr_id_t,
 )
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
-from ..typedefs cimport attr_t, flags_t
+from ..typedefs cimport attr_t
 from .token cimport Token
 from .. import parts_of_speech, schemas, util
 from ..attrs import IDS, intify_attr
-from ..compat import copy_reg, pickle
+from ..compat import copy_reg
 from ..errors import Errors, Warnings
-from ..morphology import Morphology
 from ..util import get_words_and_spaces
 from ._retokenize import Retokenizer
 from .underscore import Underscore, get_ext_args
@@ -784,7 +783,7 @@ cdef class Doc:
 # TODO:
 # 1. Test basic data-driven ORTH gazetteer
 # 2. Test more nuanced date and currency regex
-cdef attr_t entity_type, kb_id, ent_id
+cdef attr_t kb_id, ent_id
 cdef int ent_start, ent_end
 ent_spans = []
 for ent_info in ents:
@@ -987,7 +986,6 @@ cdef class Doc:
 >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
 """
 cdef int i, j
-cdef attr_id_t feature
 cdef np.ndarray[attr_t, ndim=2] output
 # Handle scalar/list inputs of strings/ints for py_attr_ids
 # See also #3064
@@ -999,8 +997,10 @@ cdef class Doc:
 py_attr_ids = [py_attr_ids]
 # Allow strings, e.g. 'lemma' or 'LEMMA'
 try:
-py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
-for id_ in py_attr_ids]
+py_attr_ids = [
+(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
+for id_ in py_attr_ids
+]
 except KeyError as msg:
 keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
 raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
@@ -1030,8 +1030,6 @@ cdef class Doc:
 DOCS: https://spacy.io/api/doc#count_by
 """
 cdef int i
-cdef attr_t attr
-cdef size_t count
 if counts is None:
 counts = Counter()
@@ -1093,7 +1091,6 @@ cdef class Doc:
 cdef int i, col
 cdef int32_t abs_head_index
 cdef attr_id_t attr_id
-cdef TokenC* tokens = self.c
 cdef int length = len(array)
 if length != len(self):
 raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
@@ -1225,7 +1222,7 @@ cdef class Doc:
 span.label,
 span.kb_id,
 span.id,
-span.text, # included as a check
+span.text,  # included as a check
 ))
 char_offset += len(doc.text)
 if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
@@ -1508,7 +1505,6 @@ cdef class Doc:
 attributes are inherited from the syntactic root of the span.
 RETURNS (Token): The first newly merged token.
 """
-cdef str tag, lemma, ent_type
 attr_len = len(attributes)
 span_len = len(spans)
 if not attr_len == span_len:
@@ -1624,7 +1620,6 @@ cdef class Doc:
 for token in char_span[1:]:
 token.is_sent_start = False
 for span_group in doc_json.get("spans", {}):
 spans = []
 for span in doc_json["spans"][span_group]:
@@ -1769,7 +1764,6 @@ cdef class Doc:
 output.fill(255)
 cdef int i, j, start_idx, end_idx
 cdef bytes byte_string
-cdef unsigned char utf8_char
 for i, byte_string in enumerate(byte_strings):
 j = 0
 start_idx = 0
@@ -1822,8 +1816,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
 cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
 # note: end is exclusive
-cdef TokenC* head
-cdef TokenC* child
 cdef int i
 # Set number of left/right children to 0. We'll increment it in the loops.
 for i in range(start, end):
@@ -1923,7 +1915,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k):
 return -1
-cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
+cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end):
 """Given a doc and a start and end position defining a set of contiguous
 tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
 LCA[i, j] is the index of the lowest common ancestor among token i and j.
@@ -1936,7 +1928,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
 RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
 with shape (n, n), where n = len(doc).
 """
-cdef int [:,:] lca_matrix
+cdef int [:, :] lca_matrix
 cdef int j, k
 n_tokens= end - start
 lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)

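The [:,:] → [:, :] edits above only reformat the LCA helper's declarations; the behaviour reachable from Python, e.g. Doc.get_lca_matrix and Doc.to_array, is unchanged. A small sketch, assuming a blank English pipeline:

>>> import spacy
>>> from spacy.attrs import LOWER, IS_ALPHA
>>> nlp = spacy.blank("en")
>>> doc = nlp("hello world")
>>> doc.to_array([LOWER, IS_ALPHA]).shape   # one row per token, one column per attribute
(2, 2)
>>> doc.get_lca_matrix().shape              # pairwise lowest-common-ancestor token indices
(2, 2)
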
View File

@@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
 cimport cython
 from cython.operator cimport dereference
-from libc.stdint cimport int32_t, int64_t
+from libc.stdint cimport int32_t
 from libcpp.pair cimport pair
 from libcpp.unordered_map cimport unordered_map
 from libcpp.unordered_set cimport unordered_set
@@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
 import weakref
 from murmurhash.mrmr cimport hash64
-from preshed.maps cimport map_get_unless_missing
 from .. import Errors
@@ -372,7 +371,9 @@ cdef class Graph:
 >>> assert graph.has_node((0,))
 >>> assert graph.has_edge((0,), (1,3), label="agent")
 """
-def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
+def __init__(
+self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None  # no-cython-lint
+):
 """Create a Graph object.
 doc (Doc): The Doc object the graph will refer to.
@@ -443,8 +444,6 @@ cdef class Graph:
 be returned, and no new edge will be created. The weight of the edge
 will be updated if a weight is specified.
 """
-label_hash = self.doc.vocab.strings.as_int(label)
-weight_float = weight if weight is not None else 0.0
 edge_index = add_edge(
 &self.c,
 EdgeC(

View File

@@ -89,4 +89,3 @@ cdef class MorphAnalysis:
 def __repr__(self):
 return self.to_json()

View File

@@ -1,5 +1,4 @@
 cimport numpy as np
-from libc.math cimport sqrt
 import copy
 import warnings
@@ -10,11 +9,10 @@ from thinc.api import get_array_module
 from ..attrs cimport *
 from ..attrs cimport ORTH, attr_id_t
 from ..lexeme cimport Lexeme
-from ..parts_of_speech cimport univ_pos_t
-from ..structs cimport LexemeC, TokenC
+from ..structs cimport TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, flags_t, hash_t
-from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from ..typedefs cimport attr_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr
 from .token cimport Token
 from ..errors import Errors, Warnings
@@ -595,7 +593,6 @@ cdef class Span:
 """
 return "".join([t.text_with_ws for t in self])
 @property
 def noun_chunks(self):
 """Iterate over the base noun phrases in the span. Yields base

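The docstring shown above builds a Span's text from each token's text_with_ws; as a quick illustration (blank pipeline and example text assumed):

>>> import spacy
>>> nlp = spacy.blank("en")
>>> doc = nlp("Hello beautiful world")
>>> doc[0:2].text            # no trailing whitespace
'Hello beautiful'
>>> doc[0:2].text_with_ws    # keeps the whitespace that followed the last token
'Hello beautiful '
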
View File

@@ -1,7 +1,7 @@
 import struct
 import weakref
 from copy import deepcopy
-from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
+from typing import Iterable, Optional, Union
 import srsly
@@ -34,7 +34,7 @@ cdef class SpanGroup:
 DOCS: https://spacy.io/api/spangroup
 """
-def __init__(self, doc, *, name="", attrs={}, spans=[]):
+def __init__(self, doc, *, name="", attrs={}, spans=[]):  # no-cython-lint
 """Create a SpanGroup.
 doc (Doc): The reference Doc object.
@@ -311,7 +311,7 @@ cdef class SpanGroup:
 other_attrs = deepcopy(other_group.attrs)
 span_group.attrs.update({
-key: value for key, value in other_attrs.items() \
+key: value for key, value in other_attrs.items()
 if key not in span_group.attrs
 })
 if len(other_group):

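SpanGroup objects like the one touched above are usually reached through doc.spans, and the attrs dict updated in the last hunk is the same user-facing dict shown here. A sketch with illustrative key names:

>>> import spacy
>>> nlp = spacy.blank("en")
>>> doc = nlp("Berlin and Munich are cities")
>>> doc.spans["places"] = [doc[0:1], doc[2:3]]      # stored as a SpanGroup
>>> len(doc.spans["places"])
2
>>> doc.spans["places"].attrs["source"] = "manual"  # free-form user attrs; when groups are concatenated, existing keys are kept
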
View File

@@ -26,7 +26,7 @@ cdef class Token:
 cdef Token self = Token.__new__(Token, vocab, doc, offset)
 return self
-#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
+# cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
 # cdef TokenC token
 # attrs = normalize_attrs(attrs)
@@ -98,12 +98,10 @@ cdef class Token:
 elif feat_name == SENT_START:
 token.sent_start = value
 @staticmethod
 cdef inline int missing_dep(const TokenC* token) nogil:
 return token.dep == MISSING_DEP
 @staticmethod
 cdef inline int missing_head(const TokenC* token) nogil:
 return Token.missing_dep(token)

View File

@@ -1,13 +1,11 @@
 # cython: infer_types=True
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
-from cython.view cimport array as cvarray
 np.import_array()
 import warnings
-import numpy
 from thinc.api import get_array_module
 from ..attrs cimport (
@@ -545,9 +543,9 @@ cdef class Token:
 def __get__(self):
 if self.i + 1 == len(self.doc):
 return True
-elif self.doc[self.i+1].is_sent_start == None:
+elif self.doc[self.i+1].is_sent_start is None:
 return None
-elif self.doc[self.i+1].is_sent_start == True:
+elif self.doc[self.i+1].is_sent_start is True:
 return True
 else:
 return False
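
The == None / == True comparisons replaced above sit in the getter that reports whether a token ends a sentence, which surfaces at the Python level as Token.is_sent_end. A sketch with manually set boundaries on a blank pipeline:

>>> import spacy
>>> nlp = spacy.blank("en")
>>> doc = nlp("One sentence . Another one .")
>>> doc[3].is_sent_start = True   # mark "Another" as starting a new sentence
>>> doc[2].is_sent_end            # the token right before a sentence start ends one
True
>>> doc[5].is_sent_end            # the last token always ends a sentence
True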