Fix cython-lint issues in tokens/.

This commit is contained in:
Raphael Mitsch 2023-07-03 15:17:54 +02:00
parent 1ac29fd8df
commit 9f62a49ebb
10 changed files with 65 additions and 80 deletions

View File

@ -47,7 +47,7 @@ jobs:
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
- name: cython-lint
run: |
python -m pip install cython-lint -c requirements.txt --ignore E501,W291
python -m pip install cython-lint -c requirements.txt --ignore E501,W291,E266
cython-lint spacy
tests:

View File

@ -1,7 +1,6 @@
# cython: infer_types=True, bounds_check=False, profile=True
from cymem.cymem cimport Pool
from libc.stdlib cimport free, malloc
from libc.string cimport memcpy, memset
from libc.string cimport memset
import numpy
from thinc.api import get_array_module
@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..structs cimport LexemeC, TokenC
from ..vocab cimport Vocab
from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
from .doc cimport Doc, set_children_from_heads, token_by_start
from .span cimport Span
from .token cimport Token
@ -147,7 +146,7 @@ def _merge(Doc doc, merges):
syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
cdef Span span
cdef const LexemeC* lex
cdef TokenC* token
@ -165,7 +164,6 @@ def _merge(Doc doc, merges):
merges.sort(key=_get_start)
for merge_index, (span, attributes) in enumerate(merges):
start = span.start
end = span.end
spans.append(span)
# House the new merged token where it starts
token = &doc.c[start]
@ -203,8 +201,9 @@ def _merge(Doc doc, merges):
# for the merged region. To do this, we create a boolean array indicating
# whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0:
doc.tensor = _resize_tensor(doc.tensor,
[(m[0].start, m[0].end) for m in merges])
doc.tensor = _resize_tensor(
doc.tensor, [(m[0].start, m[0].end) for m in merges]
)
# Memorize span roots and sets dependencies of the newly merged
# tokens to the dependencies of their roots.
span_roots = []
@ -345,7 +344,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
if to_process_tensor:
xp = get_array_module(doc.tensor)
if xp is numpy:
doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
doc.tensor = xp.append(
doc.tensor,
xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
axis=0
)
else:
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
resized_array = xp.zeros(shape, dtype="float32")
@ -367,7 +370,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
token.norm = 0 # reset norm
if to_process_tensor:
# setting the tensors of the split tokens to array of zeros
doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
doc.tensor[token_index + i:token_index + i + 1] = \
xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
# Update the character offset of the subtokens
if i != 0:
token.idx = orig_token.idx + idx_offset
@ -455,7 +459,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
def set_token_attrs(Token py_token, attrs):
cdef TokenC* token = py_token.c
cdef const LexemeC* lex = token.lex
cdef Doc doc = py_token.doc
# Assign attributes
for attr_name, attr_value in attrs.items():
if attr_name == "_": # Set extension attributes

View File

@ -61,7 +61,6 @@ cdef class Doc:
cdef int length
cdef int max_length
cdef public object noun_chunks_iterator
cdef object __weakref__

View File

@ -43,14 +43,13 @@ from ..attrs cimport (
attr_id_t,
)
from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..typedefs cimport attr_t, flags_t
from ..typedefs cimport attr_t
from .token cimport Token
from .. import parts_of_speech, schemas, util
from ..attrs import IDS, intify_attr
from ..compat import copy_reg, pickle
from ..compat import copy_reg
from ..errors import Errors, Warnings
from ..morphology import Morphology
from ..util import get_words_and_spaces
from ._retokenize import Retokenizer
from .underscore import Underscore, get_ext_args
@ -784,7 +783,7 @@ cdef class Doc:
# TODO:
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
cdef attr_t entity_type, kb_id, ent_id
cdef attr_t kb_id, ent_id
cdef int ent_start, ent_end
ent_spans = []
for ent_info in ents:
@ -987,7 +986,6 @@ cdef class Doc:
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064
@ -999,8 +997,10 @@ cdef class Doc:
py_attr_ids = [py_attr_ids]
# Allow strings, e.g. 'lemma' or 'LEMMA'
try:
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in py_attr_ids]
py_attr_ids = [
(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in py_attr_ids
]
except KeyError as msg:
keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
@ -1030,8 +1030,6 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#count_by
"""
cdef int i
cdef attr_t attr
cdef size_t count
if counts is None:
counts = Counter()
@ -1093,7 +1091,6 @@ cdef class Doc:
cdef int i, col
cdef int32_t abs_head_index
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
cdef int length = len(array)
if length != len(self):
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
@ -1508,7 +1505,6 @@ cdef class Doc:
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
cdef str tag, lemma, ent_type
attr_len = len(attributes)
span_len = len(spans)
if not attr_len == span_len:
@ -1624,7 +1620,6 @@ cdef class Doc:
for token in char_span[1:]:
token.is_sent_start = False
for span_group in doc_json.get("spans", {}):
spans = []
for span in doc_json["spans"][span_group]:
@ -1769,7 +1764,6 @@ cdef class Doc:
output.fill(255)
cdef int i, j, start_idx, end_idx
cdef bytes byte_string
cdef unsigned char utf8_char
for i, byte_string in enumerate(byte_strings):
j = 0
start_idx = 0
@ -1822,8 +1816,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
# note: end is exclusive
cdef TokenC* head
cdef TokenC* child
cdef int i
# Set number of left/right children to 0. We'll increment it in the loops.
for i in range(start, end):

View File

@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
cimport cython
from cython.operator cimport dereference
from libc.stdint cimport int32_t, int64_t
from libc.stdint cimport int32_t
from libcpp.pair cimport pair
from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
import weakref
from murmurhash.mrmr cimport hash64
from preshed.maps cimport map_get_unless_missing
from .. import Errors
@ -372,7 +371,9 @@ cdef class Graph:
>>> assert graph.has_node((0,))
>>> assert graph.has_edge((0,), (1,3), label="agent")
"""
def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
def __init__(
self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None # no-cython-lint
):
"""Create a Graph object.
doc (Doc): The Doc object the graph will refer to.
@ -443,8 +444,6 @@ cdef class Graph:
be returned, and no new edge will be created. The weight of the edge
will be updated if a weight is specified.
"""
label_hash = self.doc.vocab.strings.as_int(label)
weight_float = weight if weight is not None else 0.0
edge_index = add_edge(
&self.c,
EdgeC(

View File

@ -89,4 +89,3 @@ cdef class MorphAnalysis:
def __repr__(self):
return self.to_json()

View File

@ -1,5 +1,4 @@
cimport numpy as np
from libc.math cimport sqrt
import copy
import warnings
@ -10,11 +9,10 @@ from thinc.api import get_array_module
from ..attrs cimport *
from ..attrs cimport ORTH, attr_id_t
from ..lexeme cimport Lexeme
from ..parts_of_speech cimport univ_pos_t
from ..structs cimport LexemeC, TokenC
from ..structs cimport TokenC
from ..symbols cimport dep
from ..typedefs cimport attr_t, flags_t, hash_t
from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
from ..typedefs cimport attr_t, hash_t
from .doc cimport _get_lca_matrix, get_token_attr
from .token cimport Token
from ..errors import Errors, Warnings
@ -595,7 +593,6 @@ cdef class Span:
"""
return "".join([t.text_with_ws for t in self])
@property
def noun_chunks(self):
"""Iterate over the base noun phrases in the span. Yields base

View File

@ -1,7 +1,7 @@
import struct
import weakref
from copy import deepcopy
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
from typing import Iterable, Optional, Union
import srsly
@ -34,7 +34,7 @@ cdef class SpanGroup:
DOCS: https://spacy.io/api/spangroup
"""
def __init__(self, doc, *, name="", attrs={}, spans=[]):
def __init__(self, doc, *, name="", attrs={}, spans=[]): # no-cython-lint
"""Create a SpanGroup.
doc (Doc): The reference Doc object.
@ -311,7 +311,7 @@ cdef class SpanGroup:
other_attrs = deepcopy(other_group.attrs)
span_group.attrs.update({
key: value for key, value in other_attrs.items() \
key: value for key, value in other_attrs.items()
if key not in span_group.attrs
})
if len(other_group):

View File

@ -98,12 +98,10 @@ cdef class Token:
elif feat_name == SENT_START:
token.sent_start = value
@staticmethod
cdef inline int missing_dep(const TokenC* token) nogil:
return token.dep == MISSING_DEP
@staticmethod
cdef inline int missing_head(const TokenC* token) nogil:
return Token.missing_dep(token)

View File

@ -1,13 +1,11 @@
# cython: infer_types=True
# Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np
from cython.view cimport array as cvarray
np.import_array()
import warnings
import numpy
from thinc.api import get_array_module
from ..attrs cimport (
@ -545,9 +543,9 @@ cdef class Token:
def __get__(self):
if self.i + 1 == len(self.doc):
return True
elif self.doc[self.i+1].is_sent_start == None:
elif self.doc[self.i+1].is_sent_start is None:
return None
elif self.doc[self.i+1].is_sent_start == True:
elif self.doc[self.i+1].is_sent_start is True:
return True
else:
return False