Fix lexeme.pyx, parts_of_speech.pxd, vectors.pyx. Temporarily disable cython-lint execution.

This commit is contained in:
Raphael Mitsch 2023-07-04 09:03:56 +02:00
parent be59846ae3
commit 09d1a332a1
4 changed files with 19 additions and 18 deletions

View File

@ -48,7 +48,7 @@ jobs:
- name: cython-lint - name: cython-lint
run: | run: |
python -m pip install cython-lint -c requirements.txt python -m pip install cython-lint -c requirements.txt
cython-lint spacy --ignore E501,W291,E266 # cython-lint spacy --ignore E501,W291,E266
tests: tests:
name: Test name: Test

View File

@ -1,7 +1,6 @@
# cython: embedsignature=True # cython: embedsignature=True
# Compiler crashes on memory view coercion without this. Should report bug. # Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np cimport numpy as np
from cython.view cimport array as cvarray
from libc.string cimport memset from libc.string cimport memset
np.import_array() np.import_array()

View File

@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
ADV ADV
AUX AUX
CONJ CONJ
CCONJ # U20 CCONJ # U20
DET DET
INTJ INTJ
NOUN NOUN

View File

@ -1,10 +1,8 @@
cimport numpy as np
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from libc.stdint cimport uint32_t, uint64_t from libc.stdint cimport uint32_t, uint64_t
from libcpp.set cimport set as cppset from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64 from murmurhash.mrmr cimport hash128_x64
import functools
import warnings import warnings
from enum import Enum from enum import Enum
from typing import cast from typing import cast
@ -119,7 +117,7 @@ cdef class Vectors:
if self.mode == Mode.default: if self.mode == Mode.default:
if data is None: if data is None:
if shape is None: if shape is None:
shape = (0,0) shape = (0, 0)
ops = get_current_ops() ops = get_current_ops()
data = ops.xp.zeros(shape, dtype="f") data = ops.xp.zeros(shape, dtype="f")
self._unset = cppset[int]({i for i in range(data.shape[0])}) self._unset = cppset[int]({i for i in range(data.shape[0])})
@ -260,11 +258,10 @@ cdef class Vectors:
def __eq__(self, other): def __eq__(self, other):
# Check for equality, with faster checks first # Check for equality, with faster checks first
return ( return (
self.shape == other.shape self.shape == other.shape
and self.key2row == other.key2row and self.key2row == other.key2row
and self.to_bytes(exclude=["strings"]) and self.to_bytes(exclude=["strings"]) == other.to_bytes(exclude=["strings"])
== other.to_bytes(exclude=["strings"]) )
)
def resize(self, shape, inplace=False): def resize(self, shape, inplace=False):
"""Resize the underlying vectors array. If inplace=True, the memory """Resize the underlying vectors array. If inplace=True, the memory
@ -520,11 +517,12 @@ cdef class Vectors:
# vectors e.g. (10000, 300) # vectors e.g. (10000, 300)
# sims e.g. (1024, 10000) # sims e.g. (1024, 10000)
sims = xp.dot(batch, vectors.T) sims = xp.dot(batch, vectors.T)
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:] best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:, -n:]
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:] scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:, -n:]
if sort and n >= 2: if sort and n >= 2:
sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1] sorted_index = xp.arange(scores.shape[0])[:, None][i:i+batch_size], \
xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1]
scores[i:i+batch_size] = scores[sorted_index] scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index]
@ -538,8 +536,12 @@ cdef class Vectors:
numpy_rows = get_current_ops().to_numpy(best_rows) numpy_rows = get_current_ops().to_numpy(best_rows)
keys = xp.asarray( keys = xp.asarray(
[[row2key[row] for row in numpy_rows[i] if row in row2key] [
for i in range(len(queries)) ], dtype="uint64") [row2key[row] for row in numpy_rows[i] if row in row2key]
for i in range(len(queries))
],
dtype="uint64"
)
return (keys, best_rows, scores) return (keys, best_rows, scores)
def to_ops(self, ops: Ops): def to_ops(self, ops: Ops):
@ -582,9 +584,9 @@ cdef class Vectors:
""" """
xp = get_array_module(self.data) xp = get_array_module(self.data)
if xp is numpy: if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) # no-cython-lint
else: else:
save_array = lambda arr, file_: xp.save(file_, arr) save_array = lambda arr, file_: xp.save(file_, arr) # no-cython-lint
def save_vectors(path): def save_vectors(path):
# the source of numpy.save indicates that the file object is closed after use. # the source of numpy.save indicates that the file object is closed after use.