mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-04 20:30:24 +03:00
Parser StateC optimizations (#10746)
* `StateC`: Optimizations
  - Avoid GIL acquisition in `__init__`
  - Increase default buffer capacities on init
  - Reduce C++ exception overhead
* Fix typo
* Replace `set::count` with `set::find`
* Add exception attribute to c'tor
* Remove unused import
* Use a power-of-two value for initial capacity
  - Use default-insert to init `_heads` and `_unshiftable`
* Merge `cdef` variable declarations and assignments
This commit is contained in:
parent
e3a5350540
commit
7b06c6ca32
|
@ -6,7 +6,6 @@ cimport libcpp
|
|||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.set cimport set
|
||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from ...vocab cimport EMPTY_LEXEME
|
||||
|
@ -26,7 +25,7 @@ cdef struct ArcC:
|
|||
|
||||
|
||||
cdef cppclass StateC:
|
||||
int* _heads
|
||||
vector[int] _heads
|
||||
const TokenC* _sent
|
||||
vector[int] _stack
|
||||
vector[int] _rebuffer
|
||||
|
@ -41,25 +40,27 @@ cdef cppclass StateC:
|
|||
int offset
|
||||
int _b_i
|
||||
|
||||
__init__(const TokenC* sent, int length) nogil:
|
||||
__init__(const TokenC* sent, int length) nogil except +:
|
||||
this._heads.resize(length, -1)
|
||||
this._unshiftable.resize(length, False)
|
||||
|
||||
# Reserve memory ahead of time to minimize allocations during parsing.
|
||||
# The initial capacity set here ideally reflects the expected average-case/majority usage.
|
||||
cdef int init_capacity = 32
|
||||
this._stack.reserve(init_capacity)
|
||||
this._rebuffer.reserve(init_capacity)
|
||||
this._ents.reserve(init_capacity)
|
||||
this._left_arcs.reserve(init_capacity)
|
||||
this._right_arcs.reserve(init_capacity)
|
||||
this.history.reserve(init_capacity)
|
||||
|
||||
this._sent = sent
|
||||
this._heads = <int*>calloc(length, sizeof(int))
|
||||
if not (this._sent and this._heads):
|
||||
with gil:
|
||||
PyErr_SetFromErrno(MemoryError)
|
||||
PyErr_CheckSignals()
|
||||
this.offset = 0
|
||||
this.length = length
|
||||
this._b_i = 0
|
||||
for i in range(length):
|
||||
this._heads[i] = -1
|
||||
this._unshiftable.push_back(0)
|
||||
memset(&this._empty_token, 0, sizeof(TokenC))
|
||||
this._empty_token.lex = &EMPTY_LEXEME
|
||||
|
||||
__dealloc__():
|
||||
free(this._heads)
|
||||
|
||||
void set_context_tokens(int* ids, int n) nogil:
|
||||
cdef int i, j
|
||||
if n == 1:
|
||||
|
@ -132,19 +133,20 @@ cdef cppclass StateC:
|
|||
ids[i] = -1
|
||||
|
||||
int S(int i) nogil const:
|
||||
if i >= this._stack.size():
|
||||
cdef int stack_size = this._stack.size()
|
||||
if i >= stack_size or i < 0:
|
||||
return -1
|
||||
elif i < 0:
|
||||
return -1
|
||||
return this._stack.at(this._stack.size() - (i+1))
|
||||
else:
|
||||
return this._stack[stack_size - (i+1)]
|
||||
|
||||
int B(int i) nogil const:
|
||||
cdef int buf_size = this._rebuffer.size()
|
||||
if i < 0:
|
||||
return -1
|
||||
elif i < this._rebuffer.size():
|
||||
return this._rebuffer.at(this._rebuffer.size() - (i+1))
|
||||
elif i < buf_size:
|
||||
return this._rebuffer[buf_size - (i+1)]
|
||||
else:
|
||||
b_i = this._b_i + (i - this._rebuffer.size())
|
||||
b_i = this._b_i + (i - buf_size)
|
||||
if b_i >= this.length:
|
||||
return -1
|
||||
else:
|
||||
|
@ -243,7 +245,7 @@ cdef cppclass StateC:
|
|||
return 0
|
||||
elif this._sent[word].sent_start == 1:
|
||||
return 1
|
||||
elif this._sent_starts.count(word) >= 1:
|
||||
elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
@ -328,7 +330,7 @@ cdef cppclass StateC:
|
|||
if item >= this._unshiftable.size():
|
||||
return 0
|
||||
else:
|
||||
return this._unshiftable.at(item)
|
||||
return this._unshiftable[item]
|
||||
|
||||
void set_reshiftable(int item) nogil:
|
||||
if item < this._unshiftable.size():
|
||||
|
@ -348,6 +350,9 @@ cdef cppclass StateC:
|
|||
this._heads[child] = head
|
||||
|
||||
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
|
||||
cdef vector[ArcC]* arcs
|
||||
cdef ArcC* arc
|
||||
|
||||
arcs_it = heads_arcs.find(h_i)
|
||||
if arcs_it == heads_arcs.end():
|
||||
return
|
||||
|
@ -356,12 +361,12 @@ cdef cppclass StateC:
|
|||
if arcs.size() == 0:
|
||||
return
|
||||
|
||||
arc = arcs.back()
|
||||
arc = &arcs.back()
|
||||
if arc.head == h_i and arc.child == c_i:
|
||||
arcs.pop_back()
|
||||
else:
|
||||
for i in range(arcs.size()-1):
|
||||
arc = arcs.at(i)
|
||||
arc = &deref(arcs)[i]
|
||||
if arc.head == h_i and arc.child == c_i:
|
||||
arc.head = -1
|
||||
arc.child = -1
|
||||
|
@ -401,7 +406,7 @@ cdef cppclass StateC:
|
|||
this._rebuffer = src._rebuffer
|
||||
this._sent_starts = src._sent_starts
|
||||
this._unshiftable = src._unshiftable
|
||||
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
|
||||
this._heads = src._heads
|
||||
this._ents = src._ents
|
||||
this._left_arcs = src._left_arcs
|
||||
this._right_arcs = src._right_arcs
|
||||
|
|
Loading…
Reference in New Issue
Block a user