Fix tokens/.

This commit is contained in:
Raphael Mitsch 2023-07-03 15:17:54 +02:00
parent 1ac29fd8df
commit 9f62a49ebb
10 changed files with 65 additions and 80 deletions

View File

@@ -47,7 +47,7 @@ jobs:
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
- name: cython-lint
run: |
python -m pip install cython-lint -c requirements.txt --ignore E501,W291
python -m pip install cython-lint -c requirements.txt --ignore E501,W291,E266
cython-lint spacy
tests:
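For context: pycodestyle's E266 is "too many leading '#' for block comment", which Cython code trips constantly with doubled-hash comments and commented-out cdef lines, hence the extra ignore. A minimal illustration (hypothetical snippet, not from the repo):

## a doubled-hash block comment like this one triggers E266
# a single-hash block comment passes
x = 1  # inline comments are checked separately (E262)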

View File

@@ -1,7 +1,6 @@
# cython: infer_types=True, bounds_check=False, profile=True
from cymem.cymem cimport Pool
from libc.stdlib cimport free, malloc
from libc.string cimport memcpy, memset
from libc.string cimport memset
import numpy
from thinc.api import get_array_module
@@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..structs cimport LexemeC, TokenC
from ..vocab cimport Vocab
from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
from .doc cimport Doc, set_children_from_heads, token_by_start
from .span cimport Span
from .token cimport Token
@@ -147,7 +146,7 @@ def _merge(Doc doc, merges):
syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
cdef Span span
cdef const LexemeC* lex
cdef TokenC* token
@@ -165,7 +164,6 @@ def _merge(Doc doc, merges):
merges.sort(key=_get_start)
for merge_index, (span, attributes) in enumerate(merges):
start = span.start
end = span.end
spans.append(span)
# House the new merged token where it starts
token = &doc.c[start]
@@ -203,8 +201,9 @@ def _merge(Doc doc, merges):
# for the merged region. To do this, we create a boolean array indicating
# whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0:
doc.tensor = _resize_tensor(doc.tensor,
[(m[0].start, m[0].end) for m in merges])
doc.tensor = _resize_tensor(
doc.tensor, [(m[0].start, m[0].end) for m in merges]
)
# Memorize span roots and set the dependencies of the newly merged
# tokens to the dependencies of their roots.
span_roots = []
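The comment above summarizes what _resize_tensor does: flag every tensor row that belongs to a non-initial token of a merged span, then drop the flagged rows in one call. A standalone sketch of that idea (hypothetical helper; assumes ranges holds the (start, end) token offsets of each merge):

import numpy

def resize_tensor_sketch(tensor, ranges):
    # Flag all rows except the first of each merged span for deletion.
    delete = numpy.zeros(tensor.shape[0], dtype=bool)
    for start, end in ranges:
        delete[start + 1:end] = True  # row `start` survives as the merged token
    # numpy.delete removes the flagged rows in a single pass.
    return numpy.delete(tensor, numpy.where(delete)[0], axis=0)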
@@ -267,11 +266,11 @@ def _merge(Doc doc, merges):
span_index += 1
if span_index < len(spans) and i == spans[span_index].start:
# First token in a span
doc.c[i - offset] = doc.c[i] # move token to its place
doc.c[i - offset] = doc.c[i] # move token to its place
offset += (spans[span_index].end - spans[span_index].start) - 1
in_span = True
if not in_span:
doc.c[i - offset] = doc.c[i] # move token to its place
doc.c[i - offset] = doc.c[i] # move token to its place
for i in range(doc.length - offset, doc.length):
memset(&doc.c[i], 0, sizeof(TokenC))
@@ -345,7 +344,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
if to_process_tensor:
xp = get_array_module(doc.tensor)
if xp is numpy:
doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
doc.tensor = xp.append(
doc.tensor,
xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
axis=0
)
else:
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
resized_array = xp.zeros(shape, dtype="float32")
@@ -367,7 +370,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
token.norm = 0 # reset norm
if to_process_tensor:
# set the tensors of the split tokens to arrays of zeros
doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
doc.tensor[token_index + i:token_index + i + 1] = \
xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
# Update the character offset of the subtokens
if i != 0:
token.idx = orig_token.idx + idx_offset
@@ -455,7 +459,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
def set_token_attrs(Token py_token, attrs):
cdef TokenC* token = py_token.c
cdef const LexemeC* lex = token.lex
cdef Doc doc = py_token.doc
# Assign attributes
for attr_name, attr_value in attrs.items():
if attr_name == "_": # Set extension attributes
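_merge and _split back spaCy's public retokenization API; a quick usage example of the merge path via the documented Doc.retokenize context manager:

import spacy

nlp = spacy.blank("en")
doc = nlp("I live in New York")
with doc.retokenize() as retokenizer:
    # merging a Span funnels through _merge() above
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "New York"})
assert [t.text for t in doc] == ["I", "live", "in", "New York"]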

View File

@@ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef int [:, :] _get_lca_matrix(Doc, int start, int end)
cdef class Doc:
@@ -61,7 +61,6 @@ cdef class Doc:
cdef int length
cdef int max_length
cdef public object noun_chunks_iterator
cdef object __weakref__

View File

@@ -43,14 +43,13 @@ from ..attrs cimport (
attr_id_t,
)
from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..typedefs cimport attr_t, flags_t
from ..typedefs cimport attr_t
from .token cimport Token
from .. import parts_of_speech, schemas, util
from ..attrs import IDS, intify_attr
from ..compat import copy_reg, pickle
from ..compat import copy_reg
from ..errors import Errors, Warnings
from ..morphology import Morphology
from ..util import get_words_and_spaces
from ._retokenize import Retokenizer
from .underscore import Underscore, get_ext_args
@@ -784,7 +783,7 @@ cdef class Doc:
# TODO:
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
cdef attr_t entity_type, kb_id, ent_id
cdef attr_t kb_id, ent_id
cdef int ent_start, ent_end
ent_spans = []
for ent_info in ents:
@@ -987,7 +986,6 @@ cdef class Doc:
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064
@@ -999,8 +997,10 @@ cdef class Doc:
py_attr_ids = [py_attr_ids]
# Allow strings, e.g. 'lemma' or 'LEMMA'
try:
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in py_attr_ids]
py_attr_ids = [
(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in py_attr_ids
]
except KeyError as msg:
keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
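The normalization above is what lets Doc.to_array accept attribute names in either case as well as integer IDs, with unknown names raising the E983 KeyError. For example:

import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("en")
doc = nlp("hello world")
# integer IDs, upper-case names and lower-case names are interchangeable
assert (doc.to_array([ORTH]) == doc.to_array(["ORTH"])).all()
assert (doc.to_array(["orth"]) == doc.to_array(["ORTH"])).all()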
@@ -1030,8 +1030,6 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#count_by
"""
cdef int i
cdef attr_t attr
cdef size_t count
if counts is None:
counts = Counter()
@@ -1093,7 +1091,6 @@ cdef class Doc:
cdef int i, col
cdef int32_t abs_head_index
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
cdef int length = len(array)
if length != len(self):
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
@@ -1225,7 +1222,7 @@ cdef class Doc:
span.label,
span.kb_id,
span.id,
span.text, # included as a check
span.text, # included as a check
))
char_offset += len(doc.text)
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
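This hunk sits inside Doc.from_docs, which concatenates documents, re-bases span offsets (stashing span.text as a consistency check), and can insert a space when a document does not end in whitespace. Typical usage:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1 = nlp("Hello world.")
doc2 = nlp("Goodbye.")
# ensure_whitespace adds a separating space after "world."
merged = Doc.from_docs([doc1, doc2], ensure_whitespace=True)
assert merged.text == "Hello world. Goodbye."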
@@ -1508,7 +1505,6 @@ cdef class Doc:
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
cdef str tag, lemma, ent_type
attr_len = len(attributes)
span_len = len(spans)
if not attr_len == span_len:
@@ -1624,7 +1620,6 @@ cdef class Doc:
for token in char_span[1:]:
token.is_sent_start = False
for span_group in doc_json.get("spans", {}):
spans = []
for span in doc_json["spans"][span_group]:
@@ -1656,7 +1651,7 @@ cdef class Doc:
start = token_by_char(self.c, self.length, token_data["start"])
value = token_data["value"]
self[start]._.set(token_attr, value)
for span_attr in doc_json.get("underscore_span", {}):
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
@@ -1698,7 +1693,7 @@ cdef class Doc:
token_data["dep"] = token.dep_
token_data["head"] = token.head.i
data["tokens"].append(token_data)
if self.spans:
data["spans"] = {}
for span_group in self.spans:
@@ -1769,7 +1764,6 @@ cdef class Doc:
output.fill(255)
cdef int i, j, start_idx, end_idx
cdef bytes byte_string
cdef unsigned char utf8_char
for i, byte_string in enumerate(byte_strings):
j = 0
start_idx = 0
@@ -1822,8 +1816,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
# note: end is exclusive
cdef TokenC* head
cdef TokenC* child
cdef int i
# Set number of left/right children to 0. We'll increment it in the loops.
for i in range(start, end):
@@ -1923,7 +1915,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k):
return -1
cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end):
"""Given a doc and a start and end position defining a set of contiguous
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
LCA[i, j] is the index of the lowest common ancestor among token i and j.
@@ -1936,7 +1928,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
with shape (n, n), where n = len(doc).
"""
cdef int [:,:] lca_matrix
cdef int [:, :] lca_matrix
cdef int j, k
n_tokens = end - start
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
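_get_lca_matrix backs the public Doc.get_lca_matrix and Span.get_lca_matrix methods, which return a square matrix of lowest-common-ancestor token indices (-1 where no common ancestor exists). For example:

import spacy

nlp = spacy.blank("en")
doc = nlp("green eggs and ham")
lca = doc.get_lca_matrix()
assert lca.shape == (len(doc), len(doc))
assert lca[0, 0] == 0  # each token is its own lowest common ancestor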

View File

@@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
cimport cython
from cython.operator cimport dereference
from libc.stdint cimport int32_t, int64_t
from libc.stdint cimport int32_t
from libcpp.pair cimport pair
from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
@@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
import weakref
from murmurhash.mrmr cimport hash64
from preshed.maps cimport map_get_unless_missing
from .. import Errors
@@ -28,7 +27,7 @@ from .token import Token
cdef class Edge:
cdef readonly Graph graph
cdef readonly int i
def __init__(self, Graph graph, int i):
self.graph = graph
self.i = i
@@ -44,7 +43,7 @@ cdef class Edge:
@property
def head(self) -> "Node":
return Node(self.graph, self.graph.c.edges[self.i].head)
@property
def tail(self) -> "Tail":
return Node(self.graph, self.graph.c.edges[self.i].tail)
@@ -70,7 +69,7 @@ cdef class Node:
def __init__(self, Graph graph, int i):
"""A reference to a node of an annotation graph. Each node is made up of
an ordered set of zero or more token indices.
Node references are usually created by the Graph object itself, or from
the Node or Edge objects. You usually won't need to instantiate this
class yourself.
@@ -109,13 +108,13 @@ cdef class Node:
@property
def is_none(self) -> bool:
"""Whether the node is a special value, indicating 'none'.
The NoneNode type is returned by the Graph, Edge and Node objects when
there is no match to a query. It has the same API as Node, but it always
returns NoneNode, NoneEdge or empty lists for its queries.
"""
return False
@property
def doc(self) -> "Doc":
"""The Doc object that the graph refers to."""
@@ -130,19 +129,19 @@ cdef class Node:
def head(self, i=None, label=None) -> "Node":
"""Get the head of the first matching edge, searching by index, label,
both or neither.
For instance, `node.head(i=1)` will get the head of the second edge that
this node is a tail of. `node.head(i=1, label="ARG0")` will further
check that the second edge has the label `"ARG0"`.
If no matching node can be found, the graph's NoneNode is returned.
"""
return self.headed(i=i, label=label)
def tail(self, i=None, label=None) -> "Node":
"""Get the tail of the first matching edge, searching by index, label,
both or neither.
If no matching node can be found, the graph's NoneNode is returned.
"""
return self.tailed(i=i, label=label).tail
@@ -171,7 +170,7 @@ cdef class Node:
cdef vector[int] edge_indices
self._find_edges(edge_indices, "head", label)
return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices]
def tails(self, label=None) -> List["Node"]:
"""Find all matching tails of this node."""
cdef vector[int] edge_indices
@@ -200,7 +199,7 @@ cdef class Node:
return NoneEdge(self.graph)
else:
return Edge(self.graph, idx)
def tailed(self, i=None, label=None) -> Edge:
"""Find the first matching edge tailed by this node.
If no matching edge can be found, the graph's NoneEdge is returned.
@@ -283,7 +282,7 @@ cdef class NoneEdge(Edge):
def __init__(self, graph):
self.graph = graph
self.i = -1
@property
def doc(self) -> "Doc":
return self.graph.doc
@@ -291,7 +290,7 @@ cdef class NoneEdge(Edge):
@property
def head(self) -> "NoneNode":
return NoneNode(self.graph)
@property
def tail(self) -> "NoneNode":
return NoneNode(self.graph)
@@ -319,7 +318,7 @@ cdef class NoneNode(Node):
def __len__(self):
return 0
@property
def is_none(self):
return -1
@@ -340,14 +339,14 @@ cdef class NoneNode(Node):
def walk_heads(self):
yield from []
def walk_tails(self):
yield from []
cdef class Graph:
"""A set of directed labelled relationships between sets of tokens.
EXAMPLE:
Construction 1
>>> graph = Graph(doc, name="srl")
@@ -372,7 +371,9 @@ cdef class Graph:
>>> assert graph.has_node((0,))
>>> assert graph.has_edge((0,), (1,3), label="agent")
"""
def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
def __init__(
self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None # no-cython-lint
):
"""Create a Graph object.
doc (Doc): The Doc object the graph will refer to.
@@ -438,13 +439,11 @@ cdef class Graph:
def add_edge(self, head, tail, *, label="", weight=None) -> Edge:
"""Add an edge to the graph, connecting two groups of tokens.
If there is already an edge for the (head, tail, label) triple, it will
be returned, and no new edge will be created. The weight of the edge
will be updated if a weight is specified.
"""
label_hash = self.doc.vocab.strings.as_int(label)
weight_float = weight if weight is not None else 0.0
edge_index = add_edge(
&self.c,
EdgeC(
@@ -478,11 +477,11 @@ cdef class Graph:
def has_edge(self, head, tail, label) -> bool:
"""Check whether a (head, tail, label) triple is an edge in the graph."""
return not self.get_edge(head, tail, label=label).is_none
def add_node(self, indices) -> Node:
"""Add a node to the graph and return it. Nodes refer to ordered sets
of token indices.
This method is idempotent: if there is already a node for the given
indices, it is returned without a new node being created.
"""
@@ -510,7 +509,7 @@ cdef class Graph:
return NoneNode(self)
else:
return Node(self, node_index)
def has_node(self, tuple indices) -> bool:
"""Check whether the graph has a node for the given indices."""
return not self.get_node(indices).is_none
@@ -570,7 +569,7 @@ cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil:
graph.roots.insert(index)
graph.node_map.insert(pair[hash_t, int](key, index))
return index
cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil:
key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
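For orientation, a short sketch of the node/edge API shown in the hunks above (assuming Graph is importable from spacy.tokens.graph; node indices are tuples of token positions):

import spacy
from spacy.tokens.graph import Graph

nlp = spacy.blank("en")
doc = nlp("The dog barked")
graph = Graph(doc, name="srl")
graph.add_node((1,))  # node over token "dog"
graph.add_node((2,))  # node over token "barked"
# add_edge is idempotent for a given (head, tail, label) triple
graph.add_edge((2,), (1,), label="agent")
assert graph.has_node((1,))
assert graph.has_edge((2,), (1,), "agent")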

View File

@@ -89,4 +89,3 @@ cdef class MorphAnalysis:
def __repr__(self):
return self.to_json()
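With __repr__ delegating to to_json, a morphological analysis prints as its UD feature string. For example:

import spacy

nlp = spacy.blank("en")
doc = nlp("cats")
doc[0].set_morph("Number=Plur")
assert repr(doc[0].morph) == "Number=Plur"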

View File

@@ -1,5 +1,4 @@
cimport numpy as np
from libc.math cimport sqrt
import copy
import warnings
@@ -10,11 +9,10 @@ from thinc.api import get_array_module
from ..attrs cimport *
from ..attrs cimport ORTH, attr_id_t
from ..lexeme cimport Lexeme
from ..parts_of_speech cimport univ_pos_t
from ..structs cimport LexemeC, TokenC
from ..structs cimport TokenC
from ..symbols cimport dep
from ..typedefs cimport attr_t, flags_t, hash_t
from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
from ..typedefs cimport attr_t, hash_t
from .doc cimport _get_lca_matrix, get_token_attr
from .token cimport Token
from ..errors import Errors, Warnings
@@ -595,7 +593,6 @@ cdef class Span:
"""
return "".join([t.text_with_ws for t in self])
@property
def noun_chunks(self):
"""Iterate over the base noun phrases in the span. Yields base

View File

@@ -1,7 +1,7 @@
import struct
import weakref
from copy import deepcopy
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
from typing import Iterable, Optional, Union
import srsly
@@ -34,7 +34,7 @@ cdef class SpanGroup:
DOCS: https://spacy.io/api/spangroup
"""
def __init__(self, doc, *, name="", attrs={}, spans=[]):
def __init__(self, doc, *, name="", attrs={}, spans=[]): # no-cython-lint
"""Create a SpanGroup.
doc (Doc): The reference Doc object.
@@ -311,7 +311,7 @@ cdef class SpanGroup:
other_attrs = deepcopy(other_group.attrs)
span_group.attrs.update({
key: value for key, value in other_attrs.items() \
key: value for key, value in other_attrs.items()
if key not in span_group.attrs
})
if len(other_group):
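The mutable attrs={}/spans=[] defaults above are kept intentionally and merely exempted from linting with no-cython-lint. Typical construction through the documented API:

import spacy
from spacy.tokens import SpanGroup

nlp = spacy.blank("en")
doc = nlp("Berlin and London")
group = SpanGroup(doc, name="cities", spans=[doc[0:1], doc[2:3]])
doc.spans["cities"] = group
assert len(doc.spans["cities"]) == 2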

View File

@@ -26,7 +26,7 @@ cdef class Token:
cdef Token self = Token.__new__(Token, vocab, doc, offset)
return self
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
# cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
# cdef TokenC token
# attrs = normalize_attrs(attrs)
@@ -98,12 +98,10 @@ cdef class Token:
elif feat_name == SENT_START:
token.sent_start = value
@staticmethod
cdef inline int missing_dep(const TokenC* token) nogil:
return token.dep == MISSING_DEP
@staticmethod
cdef inline int missing_head(const TokenC* token) nogil:
return Token.missing_dep(token)
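missing_dep compares against the MISSING_DEP sentinel; from Python, the same information surfaces as Token.has_dep. For instance:

import spacy

nlp = spacy.blank("en")  # no parser in the pipeline
doc = nlp("hello world")
assert not doc[0].has_dep()  # dependency annotation is missing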

View File

@@ -1,13 +1,11 @@
# cython: infer_types=True
# Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np
from cython.view cimport array as cvarray
np.import_array()
import warnings
import numpy
from thinc.api import get_array_module
from ..attrs cimport (
@@ -238,7 +236,7 @@ cdef class Token:
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
def has_morph(self):
"""Check whether the token has annotated morph information.
Return False when the morph annotation is unset/missing.
@@ -545,9 +543,9 @@ cdef class Token:
def __get__(self):
if self.i + 1 == len(self.doc):
return True
elif self.doc[self.i+1].is_sent_start == None:
elif self.doc[self.i+1].is_sent_start is None:
return None
elif self.doc[self.i+1].is_sent_start == True:
elif self.doc[self.i+1].is_sent_start is True:
return True
else:
return False
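The replaced == None/== True comparisons above now use identity checks, the idiomatic form for singletons; the surrounding property (Token.is_sent_end) derives its value from the next token's is_sent_start:

import spacy

nlp = spacy.blank("en")  # no sentence boundaries assigned
doc = nlp("Hi there")
assert doc[-1].is_sent_end is True  # the final token always ends a sentence
assert doc[0].is_sent_end is None   # unknown: doc[1].is_sent_start is unset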