Fix tokens/.

This commit is contained in:
Raphael Mitsch 2023-07-03 15:17:54 +02:00
parent 1ac29fd8df
commit 9f62a49ebb
10 changed files with 65 additions and 80 deletions

View File

@@ -47,7 +47,7 @@ jobs:
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
- name: cython-lint
run: |
python -m pip install cython-lint -c requirements.txt --ignore E501,W291
python -m pip install cython-lint -c requirements.txt --ignore E501,W291,E266
cython-lint spacy
tests:
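For context: pycodestyle's E266 is "too many leading '#' for block comment", which Cython code trips constantly with doubled-hash comments and commented-out cdef lines, hence the extra ignore. A minimal illustration (hypothetical snippet, not from the repo):

## a doubled-hash block comment like this one triggers E266
# a single-hash block comment passes
x = 1  # inline comments are checked separately (E262)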

View File

@@ -1,7 +1,6 @@
# cython: infer_types=True, bounds_check=False, profile=True
from cymem.cymem cimport Pool
from libc.stdlib cimport free, malloc
from libc.string cimport memcpy, memset
from libc.string cimport memset
import numpy
from thinc.api import get_array_module
@@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..structs cimport LexemeC, TokenC
from ..vocab cimport Vocab
from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
from .doc cimport Doc, set_children_from_heads, token_by_start
from .span cimport Span
from .token cimport Token
@@ -147,7 +146,7 @@ def _merge(Doc doc, merges):
syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
cdef Span span
cdef const LexemeC* lex
cdef TokenC* token
@@ -165,7 +164,6 @@ def _merge(Doc doc, merges):
merges.sort(key=_get_start)
for merge_index, (span, attributes) in enumerate(merges):
start = span.start
end = span.end
spans.append(span)
# House the new merged token where it starts
token = &doc.c[start]
@@ -203,8 +201,9 @@ def _merge(Doc doc, merges):
# for the merged region. To do this, we create a boolean array indicating
# whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0:
doc.tensor = _resize_tensor(doc.tensor,
[(m[0].start, m[0].end) for m in merges])
doc.tensor = _resize_tensor(
doc.tensor, [(m[0].start, m[0].end) for m in merges]
)
# Memorize span roots and set the dependencies of the newly merged
# tokens to the dependencies of their roots.
span_roots = []
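The comment above summarizes what _resize_tensor does: flag every tensor row that belongs to a non-initial token of a merged span, then drop the flagged rows in one call. A standalone sketch of that idea (hypothetical helper; assumes ranges holds the (start, end) token offsets of each merge):

import numpy

def resize_tensor_sketch(tensor, ranges):
    # Flag all rows except the first of each merged span for deletion.
    delete = numpy.zeros(tensor.shape[0], dtype=bool)
    for start, end in ranges:
        delete[start + 1:end] = True  # row `start` survives as the merged token
    # numpy.delete removes the flagged rows in a single pass.
    return numpy.delete(tensor, numpy.where(delete)[0], axis=0)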
@@ -267,11 +266,11 @@ def _merge(Doc doc, merges):
span_index += 1
if span_index < len(spans) and i == spans[span_index].start:
# First token in a span
doc.c[i - offset] = doc.c[i] # move token to its place
doc.c[i - offset] = doc.c[i] # move token to its place
offset += (spans[span_index].end - spans[span_index].start) - 1
in_span = True
if not in_span:
doc.c[i - offset] = doc.c[i] # move token to its place
doc.c[i - offset] = doc.c[i] # move token to its place
for i in range(doc.length - offset, doc.length):
memset(&doc.c[i], 0, sizeof(TokenC))
@@ -345,7 +344,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
if to_process_tensor:
xp = get_array_module(doc.tensor)
if xp is numpy:
doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
doc.tensor = xp.append(
doc.tensor,
xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
axis=0
)
else:
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
resized_array = xp.zeros(shape, dtype="float32")
@@ -367,7 +370,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
token.norm = 0 # reset norm
if to_process_tensor:
# set the tensors of the split tokens to arrays of zeros
doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
doc.tensor[token_index + i:token_index + i + 1] = \
xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
# Update the character offset of the subtokens
if i != 0:
token.idx = orig_token.idx + idx_offset
@@ -455,7 +459,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
def set_token_attrs(Token py_token, attrs):
cdef TokenC* token = py_token.c
cdef const LexemeC* lex = token.lex
cdef Doc doc = py_token.doc
# Assign attributes
for attr_name, attr_value in attrs.items():
if attr_name == "_": # Set extension attributes
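_merge and _split back spaCy's public retokenization API; a quick usage example of the merge path via the documented Doc.retokenize context manager:

import spacy

nlp = spacy.blank("en")
doc = nlp("I live in New York")
with doc.retokenize() as retokenizer:
    # merging a Span funnels through _merge() above
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "New York"})
assert [t.text for t in doc] == ["I", "live", "in", "New York"]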

View File

@@ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef int [:, :] _get_lca_matrix(Doc, int start, int end)
cdef class Doc:
@@ -61,7 +61,6 @@ cdef class Doc:
cdef int length
cdef int max_length
cdef public object noun_chunks_iterator
cdef object __weakref__

View File

@@ -43,14 +43,13 @@ from ..attrs cimport (
attr_id_t,
)
from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..typedefs cimport attr_t, flags_t
from ..typedefs cimport attr_t
from .token cimport Token
from .. import parts_of_speech, schemas, util
from ..attrs import IDS, intify_attr
from ..compat import copy_reg, pickle
from ..compat import copy_reg
from ..errors import Errors, Warnings
from ..morphology import Morphology
from ..util import get_words_and_spaces
from ._retokenize import Retokenizer
from .underscore import Underscore, get_ext_args
@@ -784,7 +783,7 @@ cdef class Doc:
# TODO:
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
cdef attr_t entity_type, kb_id, ent_id
cdef attr_t kb_id, ent_id
cdef int ent_start, ent_end
ent_spans = []
for ent_info in ents:
@@ -987,7 +986,6 @@ cdef class Doc:
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064
@@ -999,8 +997,10 @@ cdef class Doc:
py_attr_ids = [py_attr_ids]
# Allow strings, e.g. 'lemma' or 'LEMMA'
try:
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in py_attr_ids]
py_attr_ids = [
(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in py_attr_ids
]
except KeyError as msg:
keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
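The normalization above is what lets Doc.to_array accept attribute names in either case as well as integer IDs, with unknown names raising the E983 KeyError. For example:

import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("en")
doc = nlp("hello world")
# integer IDs, upper-case names and lower-case names are interchangeable
assert (doc.to_array([ORTH]) == doc.to_array(["ORTH"])).all()
assert (doc.to_array(["orth"]) == doc.to_array(["ORTH"])).all()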
@@ -1030,8 +1030,6 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#count_by
"""
cdef int i
cdef attr_t attr
cdef size_t count
if counts is None:
counts = Counter()
@@ -1093,7 +1091,6 @@ cdef class Doc:
cdef int i, col
cdef int32_t abs_head_index
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
cdef int length = len(array)
if length != len(self):
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
@@ -1225,7 +1222,7 @@ cdef class Doc:
span.label,
span.kb_id,
span.id,
span.text, # included as a check
span.text, # included as a check
))
char_offset += len(doc.text)
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
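This hunk sits inside Doc.from_docs, which concatenates documents, re-bases span offsets (stashing span.text as a consistency check), and can insert a space when a document does not end in whitespace. Typical usage:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1 = nlp("Hello world.")
doc2 = nlp("Goodbye.")
# ensure_whitespace adds a separating space after "world."
merged = Doc.from_docs([doc1, doc2], ensure_whitespace=True)
assert merged.text == "Hello world. Goodbye."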
@@ -1508,7 +1505,6 @@ cdef class Doc:
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
cdef str tag, lemma, ent_type
attr_len = len(attributes)
span_len = len(spans)
if not attr_len == span_len:
@@ -1624,7 +1620,6 @@ cdef class Doc:
for token in char_span[1:]:
token.is_sent_start = False
for span_group in doc_json.get("spans", {}):
spans = []
for span in doc_json["spans"][span_group]:
@@ -1656,7 +1651,7 @@ cdef class Doc:
start = token_by_char(self.c, self.length, token_data["start"])
value = token_data["value"]
self[start]._.set(token_attr, value)
for span_attr in doc_json.get("underscore_span", {}):
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
@@ -1698,7 +1693,7 @@ cdef class Doc:
token_data["dep"] = token.dep_
token_data["head"] = token.head.i
data["tokens"].append(token_data)
if self.spans:
data["spans"] = {}
for span_group in self.spans:
@@ -1769,7 +1764,6 @@ cdef class Doc:
output.fill(255)
cdef int i, j, start_idx, end_idx
cdef bytes byte_string
cdef unsigned char utf8_char
for i, byte_string in enumerate(byte_strings):
j = 0
start_idx = 0
@@ -1822,8 +1816,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
# note: end is exclusive
cdef TokenC* head
cdef TokenC* child
cdef int i
# Set number of left/right children to 0. We'll increment it in the loops.
for i in range(start, end):
@@ -1923,7 +1915,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k):
return -1
cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end):
"""Given a doc and a start and end position defining a set of contiguous
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
LCA[i, j] is the index of the lowest common ancestor among token i and j.
@@ -1936,7 +1928,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
with shape (n, n), where n = len(doc).
"""
cdef int [:,:] lca_matrix
cdef int [:, :] lca_matrix
cdef int j, k
n_tokens = end - start
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
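_get_lca_matrix backs the public Doc.get_lca_matrix and Span.get_lca_matrix methods, which return a square matrix of lowest-common-ancestor token indices (-1 where no common ancestor exists). For example:

import spacy

nlp = spacy.blank("en")
doc = nlp("green eggs and ham")
lca = doc.get_lca_matrix()
assert lca.shape == (len(doc), len(doc))
assert lca[0, 0] == 0  # each token is its own lowest common ancestor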

View File

@@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
cimport cython
from cython.operator cimport dereference
from libc.stdint cimport int32_t, int64_t
from libc.stdint cimport int32_t
from libcpp.pair cimport pair
from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
@@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
import weakref
from murmurhash.mrmr cimport hash64
from preshed.maps cimport map_get_unless_missing
from .. import Errors
@@ -28,7 +27,7 @@ from .token import Token
cdef class Edge:
cdef readonly Graph graph
cdef readonly int i
def __init__(self, Graph graph, int i):
self.graph = graph
self.i = i
@@ -44,7 +43,7 @@ cdef class Edge:
@property
def head(self) -> "Node":
return Node(self.graph, self.graph.c.edges[self.i].head)
@property
def tail(self) -> "Tail":
return Node(self.graph, self.graph.c.edges[self.i].tail)
@@ -70,7 +69,7 @@ cdef class Node:
def __init__(self, Graph graph, int i):
"""A reference to a node of an annotation graph. Each node is made up of
an ordered set of zero or more token indices.
Node references are usually created by the Graph object itself, or from
the Node or Edge objects. You usually won't need to instantiate this
class yourself.
@@ -109,13 +108,13 @@ cdef class Node:
@property
def is_none(self) -> bool:
"""Whether the node is a special value, indicating 'none'.
The NoneNode type is returned by the Graph, Edge and Node objects when
there is no match to a query. It has the same API as Node, but it always
returns NoneNode, NoneEdge or empty lists for its queries.
"""
return False
@property
def doc(self) -> "Doc":
"""The Doc object that the graph refers to."""
@@ -130,19 +129,19 @@ cdef class Node:
def head(self, i=None, label=None) -> "Node":
"""Get the head of the first matching edge, searching by index, label,
both or neither.
For instance, `node.head(i=1)` will get the head of the second edge that
this node is a tail of. `node.head(i=1, label="ARG0")` will further
check that the second edge has the label `"ARG0"`.
If no matching node can be found, the graph's NoneNode is returned.
"""
return self.headed(i=i, label=label)
def tail(self, i=None, label=None) -> "Node":
"""Get the tail of the first matching edge, searching by index, label,
both or neither.
If no matching node can be found, the graph's NoneNode is returned.
"""
return self.tailed(i=i, label=label).tail
@@ -171,7 +170,7 @@ cdef class Node:
cdef vector[int] edge_indices
self._find_edges(edge_indices, "head", label)
return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices]
def tails(self, label=None) -> List["Node"]:
"""Find all matching tails of this node."""
cdef vector[int] edge_indices
@@ -200,7 +199,7 @@ cdef class Node:
return NoneEdge(self.graph)
else:
return Edge(self.graph, idx)
def tailed(self, i=None, label=None) -> Edge:
"""Find the first matching edge tailed by this node.
If no matching edge can be found, the graph's NoneEdge is returned.
@@ -283,7 +282,7 @@ cdef class NoneEdge(Edge):
def __init__(self, graph):
self.graph = graph
self.i = -1
@property
def doc(self) -> "Doc":
return self.graph.doc
@@ -291,7 +290,7 @@ cdef class NoneEdge(Edge):
@property
def head(self) -> "NoneNode":
return NoneNode(self.graph)
@property
def tail(self) -> "NoneNode":
return NoneNode(self.graph)
@@ -319,7 +318,7 @@ cdef class NoneNode(Node):
def __len__(self):
return 0
@property
def is_none(self):
return -1
@@ -340,14 +339,14 @@ cdef class NoneNode(Node):
def walk_heads(self):
yield from []
def walk_tails(self):
yield from []
cdef class Graph:
"""A set of directed labelled relationships between sets of tokens.
EXAMPLE:
Construction 1
>>> graph = Graph(doc, name="srl")
@@ -372,7 +371,9 @@ cdef class Graph:
>>> assert graph.has_node((0,))
>>> assert graph.has_edge((0,), (1,3), label="agent")
"""
def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
def __init__(
self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None # no-cython-lint
):
"""Create a Graph object.
doc (Doc): The Doc object the graph will refer to.
@@ -438,13 +439,11 @@ cdef class Graph:
def add_edge(self, head, tail, *, label="", weight=None) -> Edge:
"""Add an edge to the graph, connecting two groups of tokens.
If there is already an edge for the (head, tail, label) triple, it will
be returned, and no new edge will be created. The weight of the edge
will be updated if a weight is specified.
"""
label_hash = self.doc.vocab.strings.as_int(label)
weight_float = weight if weight is not None else 0.0
edge_index = add_edge(
&self.c,
EdgeC(
@@ -478,11 +477,11 @@ cdef class Graph:
def has_edge(self, head, tail, label) -> bool:
"""Check whether a (head, tail, label) triple is an edge in the graph."""
return not self.get_edge(head, tail, label=label).is_none
def add_node(self, indices) -> Node:
"""Add a node to the graph and return it. Nodes refer to ordered sets
of token indices.
This method is idempotent: if there is already a node for the given
indices, it is returned without a new node being created.
"""
@@ -510,7 +509,7 @@ cdef class Graph:
return NoneNode(self)
else:
return Node(self, node_index)
def has_node(self, tuple indices) -> bool:
"""Check whether the graph has a node for the given indices."""
return not self.get_node(indices).is_none
@@ -570,7 +569,7 @@ cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil:
graph.roots.insert(index)
graph.node_map.insert(pair[hash_t, int](key, index))
return index
cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil:
key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
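For orientation, a short sketch of the node/edge API shown in the hunks above (assuming Graph is importable from spacy.tokens.graph; node indices are tuples of token positions):

import spacy
from spacy.tokens.graph import Graph

nlp = spacy.blank("en")
doc = nlp("The dog barked")
graph = Graph(doc, name="srl")
graph.add_node((1,))  # node over token "dog"
graph.add_node((2,))  # node over token "barked"
# add_edge is idempotent for a given (head, tail, label) triple
graph.add_edge((2,), (1,), label="agent")
assert graph.has_node((1,))
assert graph.has_edge((2,), (1,), "agent")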

View File

@@ -89,4 +89,3 @@ cdef class MorphAnalysis:
def __repr__(self):
return self.to_json()
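With __repr__ delegating to to_json, a morphological analysis prints as its UD feature string. For example:

import spacy

nlp = spacy.blank("en")
doc = nlp("cats")
doc[0].set_morph("Number=Plur")
assert repr(doc[0].morph) == "Number=Plur"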

View File

@@ -1,5 +1,4 @@
cimport numpy as np
from libc.math cimport sqrt
import copy
import warnings
@@ -10,11 +9,10 @@ from thinc.api import get_array_module
from ..attrs cimport *
from ..attrs cimport ORTH, attr_id_t
from ..lexeme cimport Lexeme
from ..parts_of_speech cimport univ_pos_t
from ..structs cimport LexemeC, TokenC
from ..structs cimport TokenC
from ..symbols cimport dep
from ..typedefs cimport attr_t, flags_t, hash_t
from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
from ..typedefs cimport attr_t, hash_t
from .doc cimport _get_lca_matrix, get_token_attr
from .token cimport Token
from ..errors import Errors, Warnings
@@ -595,7 +593,6 @@ cdef class Span:
"""
return "".join([t.text_with_ws for t in self])
@property
def noun_chunks(self):
"""Iterate over the base noun phrases in the span. Yields base

View File

@@ -1,7 +1,7 @@
import struct
import weakref
from copy import deepcopy
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
from typing import Iterable, Optional, Union
import srsly
@@ -34,7 +34,7 @@ cdef class SpanGroup:
DOCS: https://spacy.io/api/spangroup
"""
def __init__(self, doc, *, name="", attrs={}, spans=[]):
def __init__(self, doc, *, name="", attrs={}, spans=[]): # no-cython-lint
"""Create a SpanGroup.
doc (Doc): The reference Doc object.
@@ -311,7 +311,7 @@ cdef class SpanGroup:
other_attrs = deepcopy(other_group.attrs)
span_group.attrs.update({
key: value for key, value in other_attrs.items() \
key: value for key, value in other_attrs.items()
if key not in span_group.attrs
})
if len(other_group):
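The mutable attrs={}/spans=[] defaults above are kept intentionally and merely exempted from linting with no-cython-lint. Typical construction through the documented API:

import spacy
from spacy.tokens import SpanGroup

nlp = spacy.blank("en")
doc = nlp("Berlin and London")
group = SpanGroup(doc, name="cities", spans=[doc[0:1], doc[2:3]])
doc.spans["cities"] = group
assert len(doc.spans["cities"]) == 2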

View File

@@ -26,7 +26,7 @@ cdef class Token:
cdef Token self = Token.__new__(Token, vocab, doc, offset)
return self
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
# cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
# cdef TokenC token
# attrs = normalize_attrs(attrs)
@@ -98,12 +98,10 @@ cdef class Token:
elif feat_name == SENT_START:
token.sent_start = value
@staticmethod
cdef inline int missing_dep(const TokenC* token) nogil:
return token.dep == MISSING_DEP
@staticmethod
cdef inline int missing_head(const TokenC* token) nogil:
return Token.missing_dep(token)
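missing_dep compares against the MISSING_DEP sentinel; from Python, the same information surfaces as Token.has_dep. For instance:

import spacy

nlp = spacy.blank("en")  # no parser in the pipeline
doc = nlp("hello world")
assert not doc[0].has_dep()  # dependency annotation is missing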

View File

@@ -1,13 +1,11 @@
# cython: infer_types=True
# Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np
from cython.view cimport array as cvarray
np.import_array()
import warnings
import numpy
from thinc.api import get_array_module
from ..attrs cimport (
@@ -238,7 +236,7 @@ cdef class Token:
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
def has_morph(self):
"""Check whether the token has annotated morph information.
Return False when the morph annotation is unset/missing.
@@ -545,9 +543,9 @@ cdef class Token:
def __get__(self):
if self.i + 1 == len(self.doc):
return True
elif self.doc[self.i+1].is_sent_start == None:
elif self.doc[self.i+1].is_sent_start is None:
return None
elif self.doc[self.i+1].is_sent_start == True:
elif self.doc[self.i+1].is_sent_start is True:
return True
else:
return False
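The replaced == None/== True comparisons above now use identity checks, the idiomatic form for singletons; the surrounding property (Token.is_sent_end) derives its value from the next token's is_sent_start:

import spacy

nlp = spacy.blank("en")  # no sentence boundaries assigned
doc = nlp("Hi there")
assert doc[-1].is_sent_end is True  # the final token always ends a sentence
assert doc[0].is_sent_end is None   # unknown: doc[1].is_sent_start is unset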