mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-29 18:23:06 +03:00
cython fixes and cleanup
This commit is contained in:
parent
846472129c
commit
96f2e30c4b
|
@ -158,7 +158,6 @@ cdef class PhraseMatcher:
|
|||
del self._callbacks[key]
|
||||
del self._docs[key]
|
||||
|
||||
|
||||
def _add_from_arrays(self, key, specs, *, on_match=None):
|
||||
"""Add a preprocessed list of specs, with an optional callback.
|
||||
|
||||
|
@ -194,7 +193,6 @@ cdef class PhraseMatcher:
|
|||
result = internal_node
|
||||
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
|
||||
|
||||
|
||||
def add(self, key, docs, *, on_match=None):
|
||||
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
|
||||
key, a list of one or more patterns, and (optionally) an on_match callback.
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# cython: infer_types=True, cdivision=True, boundscheck=False
|
||||
from typing import Any, List, Optional, Tuple, TypeVar, cast
|
||||
from typing import Any, List, Optional, Tuple, cast
|
||||
|
||||
from libc.stdlib cimport calloc, free, realloc
|
||||
from libc.string cimport memcpy, memset
|
||||
|
@ -23,7 +23,7 @@ from thinc.api import (
|
|||
|
||||
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
|
||||
|
||||
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
|
||||
from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
|
||||
|
||||
from ..errors import Errors
|
||||
from ..pipeline._parser_internals import _beam_utils
|
||||
|
@ -136,7 +136,7 @@ def init(
|
|||
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
|
||||
):
|
||||
if X is not None:
|
||||
docs, moves = X
|
||||
docs, _ = X
|
||||
model.get_ref("tok2vec").initialize(X=docs)
|
||||
else:
|
||||
model.get_ref("tok2vec").initialize()
|
||||
|
@ -145,7 +145,7 @@ def init(
|
|||
current_nO = model.maybe_get_dim("nO")
|
||||
if current_nO is None or current_nO != inferred_nO:
|
||||
model.attrs["resize_output"](model, inferred_nO)
|
||||
nO = model.get_dim("nO")
|
||||
# nO = model.get_dim("nO")
|
||||
nP = model.get_dim("nP")
|
||||
nH = model.get_dim("nH")
|
||||
nI = model.get_dim("nI")
|
||||
|
@ -194,7 +194,8 @@ class TransitionModelInputs:
|
|||
moves: TransitionSystem,
|
||||
actions: Optional[List[Ints1d]] = None,
|
||||
max_moves: int = 0,
|
||||
states: Optional[List[State]]=None):
|
||||
states: Optional[List[State]] = None,
|
||||
):
|
||||
"""
|
||||
actions (Optional[List[Ints1d]]): actions to apply for each Doc.
|
||||
docs (List[Doc]): Docs to predict transition sequences for.
|
||||
|
@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
|
|||
|
||||
return (states, scores), backprop
|
||||
|
||||
|
||||
cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
|
||||
WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
|
||||
cdef int i, j
|
||||
cdef int i
|
||||
cdef vector[StateC *] unfinished
|
||||
cdef ActivationsC activations = _alloc_activations(sizes)
|
||||
cdef np.ndarray step_scores
|
||||
|
@ -371,7 +373,7 @@ def _forward_fallback(
|
|||
for clas in set(model.attrs["unseen_classes"]):
|
||||
if (d_scores[:, clas] < 0).any():
|
||||
model.attrs["unseen_classes"].remove(clas)
|
||||
d_scores *= seen_mask == False
|
||||
d_scores *= seen_mask == False # no-cython-lint
|
||||
# Calculate the gradients for the parameters of the output layer.
|
||||
# The weight gemm is (nS, nO) @ (nS, nH).T
|
||||
output.inc_grad("b", d_scores.sum(axis=0))
|
||||
|
@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
|
|||
scores[i*n.classes+j] = min_
|
||||
|
||||
|
||||
cdef void _sum_state_features(CBlas cblas, float* output,
|
||||
const float* cached, const int* token_ids, SizesC n) nogil:
|
||||
cdef int idx, b, f, i
|
||||
cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
|
||||
const int* token_ids, SizesC n) nogil:
|
||||
cdef int idx, b, f
|
||||
cdef const float* feature
|
||||
cdef int B = n.states
|
||||
cdef int O = n.hiddens * n.pieces
|
||||
cdef int O = n.hiddens * n.pieces # no-cython-lint
|
||||
cdef int F = n.feats
|
||||
cdef int T = n.tokens
|
||||
padding = cached + (T * F * O)
|
||||
|
@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
|
|||
feature = &cached[idx]
|
||||
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
|
||||
token_ids += F
|
||||
|
||||
|
|
|
@ -80,7 +80,6 @@ cdef class Morphology:
|
|||
out.sort(key=lambda x: x[0])
|
||||
return dict(out)
|
||||
|
||||
|
||||
def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
|
||||
norm_feats_string = self.FEATURE_SEP.join([
|
||||
self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
|
||||
|
@ -88,7 +87,6 @@ cdef class Morphology:
|
|||
])
|
||||
return norm_feats_string or self.EMPTY_MORPH
|
||||
|
||||
|
||||
cdef hash_t _add(self, features):
|
||||
"""Insert a morphological analysis in the morphology table, if not
|
||||
already present. The morphological analysis may be provided in the UD
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from libc.stdint cimport int32_t
|
||||
from libcpp.memory cimport shared_ptr
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
|
|
|
@ -58,7 +58,6 @@ cdef class Beam:
|
|||
void* extra_args) except -1
|
||||
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
|
||||
|
||||
|
||||
cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
|
||||
self.scores[i][j] = score
|
||||
self.is_valid[i][j] = is_valid
|
||||
|
|
|
@ -1,11 +1,8 @@
|
|||
# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
|
||||
cimport cython
|
||||
from libc.math cimport exp, log
|
||||
from libc.string cimport memcpy, memset
|
||||
|
||||
import math
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.math cimport exp
|
||||
from libc.string cimport memcpy, memset
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
|
||||
|
@ -70,7 +67,7 @@ cdef class Beam:
|
|||
self.costs[i][j] = costs[j]
|
||||
|
||||
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
|
||||
cdef int i, j
|
||||
cdef int i
|
||||
for i in range(self.width):
|
||||
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
|
||||
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
|
||||
|
@ -176,7 +173,6 @@ cdef class Beam:
|
|||
beam-width, and n is the number of classes.
|
||||
"""
|
||||
cdef Entry entry
|
||||
cdef weight_t score
|
||||
cdef _State* s
|
||||
cdef int i, j, move_id
|
||||
assert self.size >= 1
|
||||
|
|
|
@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
|
|||
action.do(states[i], action.label)
|
||||
states[i].history.push_back(guess)
|
||||
free(is_valid)
|
||||
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from itertools import islice
|
||||
from typing import Callable, Dict, Iterable, List, Optional, Union
|
||||
from typing import Callable, Dict, Iterable, Optional, Union
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
import warnings
|
||||
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
|
||||
|
||||
import srsly
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from ..errors import Errors, Warnings
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..training import Example
|
||||
from ..util import raise_error
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
import warnings
|
||||
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
|
||||
|
||||
import srsly
|
||||
|
@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
|
|||
from ..tokens.doc cimport Doc
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors, Warnings
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..training import Example, validate_distillation_examples, validate_examples
|
||||
from ..vocab import Vocab
|
||||
|
@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
|
|||
except Exception as e:
|
||||
error_handler(self.name, self, [doc], e)
|
||||
|
||||
|
||||
def distill(self,
|
||||
teacher_pipe: Optional["TrainablePipe"],
|
||||
examples: Iterable["Example"],
|
||||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
|
||||
losses: Optional[Dict[str, float]] = None
|
||||
) -> Dict[str, float]:
|
||||
"""Train a pipe (the student) on the predictions of another pipe
|
||||
(the teacher). The student is typically trained on the probability
|
||||
distribution of the teacher, but details may differ per pipe.
|
||||
|
|
|
@ -227,7 +227,8 @@ class Parser(TrainablePipe):
|
|||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]]=None):
|
||||
losses: Optional[Dict[str, float]] = None
|
||||
):
|
||||
"""Train a pipe (the student) on the predictions of another pipe
|
||||
(the teacher). The student is trained on the transition probabilities
|
||||
of the teacher.
|
||||
|
@ -277,7 +278,9 @@ class Parser(TrainablePipe):
|
|||
# teacher's distributions.
|
||||
|
||||
student_inputs = TransitionModelInputs(docs=student_docs,
|
||||
states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
|
||||
states=[state.copy() for state in states],
|
||||
moves=self.moves,
|
||||
max_moves=max_moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = _states_diff_to_actions(states, student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
|
||||
|
@ -294,7 +297,6 @@ class Parser(TrainablePipe):
|
|||
|
||||
return losses
|
||||
|
||||
|
||||
def get_teacher_student_loss(
|
||||
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
|
||||
normalize: bool = False,
|
||||
|
@ -436,8 +438,10 @@ class Parser(TrainablePipe):
|
|||
else:
|
||||
init_states, gold_states, _ = self.moves.init_gold_batch(examples)
|
||||
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves,
|
||||
max_moves=max_moves, states=[state.copy() for state in init_states])
|
||||
inputs = TransitionModelInputs(docs=docs,
|
||||
moves=self.moves,
|
||||
max_moves=max_moves,
|
||||
states=[state.copy() for state in init_states])
|
||||
(pred_states, scores), backprop_scores = self.model.begin_update(inputs)
|
||||
if sum(s.shape[0] for s in scores) == 0:
|
||||
return losses
|
||||
|
@ -483,9 +487,7 @@ class Parser(TrainablePipe):
|
|||
cdef TransitionSystem moves = self.moves
|
||||
cdef StateClass state
|
||||
cdef int clas
|
||||
cdef int nF = self.model.get_dim("nF")
|
||||
cdef int nO = moves.n_moves
|
||||
cdef int nS = sum([len(history) for history in histories])
|
||||
cdef Pool mem = Pool()
|
||||
cdef np.ndarray costs_i
|
||||
is_valid = <int*>mem.alloc(nO, sizeof(int))
|
||||
|
@ -552,8 +554,8 @@ class Parser(TrainablePipe):
|
|||
|
||||
return losses
|
||||
|
||||
def update_beam(self, examples, *, beam_width,
|
||||
drop=0., sgd=None, losses=None, beam_density=0.0):
|
||||
def update_beam(self, examples, *, beam_width, drop=0.,
|
||||
sgd=None, losses=None, beam_density=0.0):
|
||||
raise NotImplementedError
|
||||
|
||||
def set_output(self, nO):
|
||||
|
@ -678,7 +680,8 @@ class Parser(TrainablePipe):
|
|||
return states
|
||||
|
||||
# Parse the states that are too long with the teacher's parsing model.
|
||||
teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
|
||||
teacher_inputs = TransitionModelInputs(docs=docs,
|
||||
moves=moves,
|
||||
states=[state.copy() for state in to_cut])
|
||||
(teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
|
||||
|
||||
|
@ -778,6 +781,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
|
|||
|
||||
return actions
|
||||
|
||||
|
||||
def _states_diff_to_actions(
|
||||
before_states: List[StateClass],
|
||||
after_states: List[StateClass]
|
||||
|
@ -798,7 +802,8 @@ def _states_diff_to_actions(
|
|||
c_state_before = before_state.c
|
||||
c_state_after = after_state.c
|
||||
|
||||
assert equal(c_state_before.history.begin(), c_state_before.history.end(),
|
||||
assert equal(c_state_before.history.begin(),
|
||||
c_state_before.history.end(),
|
||||
c_state_after.history.begin())
|
||||
|
||||
actions = []
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# cython: infer_types=True
|
||||
from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
|
||||
from typing import Iterable, Iterator, List, Optional, Tuple, Union
|
||||
|
||||
cimport cython
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.string cimport memcpy
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
@ -243,7 +242,6 @@ cdef class StringStore:
|
|||
cdef int n_length_bytes
|
||||
cdef int i
|
||||
cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
|
||||
cdef uint32_t ulength = length
|
||||
if length < sizeof(string.s):
|
||||
string.s[0] = <unsigned char>length
|
||||
memcpy(&string.s[1], chars, length)
|
||||
|
@ -301,7 +299,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
|
|||
|
||||
try:
|
||||
return hash_string(string_or_hash)
|
||||
except:
|
||||
except: # no-cython-lint
|
||||
if _try_coerce_to_hash(string_or_hash, &str_hash):
|
||||
# Coerce the integral key to the expected primitive hash type.
|
||||
# This ensures that custom/overloaded "primitive" data types
|
||||
|
@ -318,6 +316,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
|
|||
try:
|
||||
out_hash[0] = key
|
||||
return True
|
||||
except:
|
||||
except: # no-cython-lint
|
||||
return False
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
|
||||
from spacy.typedefs cimport class_t, weight_t
|
||||
from spacy.typedefs cimport class_t
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -42,6 +42,7 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
|
|||
state = <TestState*>state
|
||||
mem.free(state)
|
||||
|
||||
|
||||
@cytest
|
||||
@pytest.mark.parametrize("nr_class,beam_width",
|
||||
[
|
||||
|
@ -56,10 +57,12 @@ def test_init(nr_class, beam_width):
|
|||
assert b.width == beam_width
|
||||
assert b.nr_class == nr_class
|
||||
|
||||
|
||||
@cytest
|
||||
def test_init_violn():
|
||||
MaxViolation()
|
||||
|
||||
|
||||
@cytest
|
||||
@pytest.mark.parametrize("nr_class,beam_width,length",
|
||||
[
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
cimport numpy as np
|
||||
from libc.string cimport memset
|
||||
|
||||
from ..errors import Errors
|
||||
from ..morphology import Morphology
|
||||
|
|
|
@ -225,8 +225,8 @@ cdef class Span:
|
|||
|
||||
@property
|
||||
def _(self):
|
||||
cdef SpanC* span_c = self.span_c()
|
||||
"""Custom extension attributes registered via `set_extension`."""
|
||||
cdef SpanC* span_c = self.span_c()
|
||||
return Underscore(Underscore.span_extensions, self,
|
||||
start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
|
||||
|
||||
|
@ -933,7 +933,6 @@ cdef class Span:
|
|||
self.id_ = ent_id_
|
||||
|
||||
|
||||
|
||||
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
||||
# Don't allow spaces to be the root, if there are
|
||||
# better candidates
|
||||
|
|
Loading…
Reference in New Issue
Block a user