mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 04:40:20 +03:00
Parser StateC
optimizations (#10746)
* `StateC`: Optimizations Avoid GIL acquisition in `__init__` Increase default buffer capacities on init Reduce C++ exception overhead * Fix typo * Replace `set::count` with `set::find` * Add exception attribute to c'tor * Remove unused import * Use a power-of-two value for initial capacity Use default-insert to init `_heads` and `_unshiftable` * Merge `cdef` variable declarations and assignments
This commit is contained in:
parent
e3a5350540
commit
7b06c6ca32
|
@ -6,7 +6,6 @@ cimport libcpp
|
||||||
from libcpp.unordered_map cimport unordered_map
|
from libcpp.unordered_map cimport unordered_map
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libcpp.set cimport set
|
from libcpp.set cimport set
|
||||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from ...vocab cimport EMPTY_LEXEME
|
from ...vocab cimport EMPTY_LEXEME
|
||||||
|
@ -26,7 +25,7 @@ cdef struct ArcC:
|
||||||
|
|
||||||
|
|
||||||
cdef cppclass StateC:
|
cdef cppclass StateC:
|
||||||
int* _heads
|
vector[int] _heads
|
||||||
const TokenC* _sent
|
const TokenC* _sent
|
||||||
vector[int] _stack
|
vector[int] _stack
|
||||||
vector[int] _rebuffer
|
vector[int] _rebuffer
|
||||||
|
@ -41,25 +40,27 @@ cdef cppclass StateC:
|
||||||
int offset
|
int offset
|
||||||
int _b_i
|
int _b_i
|
||||||
|
|
||||||
__init__(const TokenC* sent, int length) nogil:
|
__init__(const TokenC* sent, int length) nogil except +:
|
||||||
|
this._heads.resize(length, -1)
|
||||||
|
this._unshiftable.resize(length, False)
|
||||||
|
|
||||||
|
# Reserve memory ahead of time to minimize allocations during parsing.
|
||||||
|
# The initial capacity set here ideally reflects the expected average-case/majority usage.
|
||||||
|
cdef int init_capacity = 32
|
||||||
|
this._stack.reserve(init_capacity)
|
||||||
|
this._rebuffer.reserve(init_capacity)
|
||||||
|
this._ents.reserve(init_capacity)
|
||||||
|
this._left_arcs.reserve(init_capacity)
|
||||||
|
this._right_arcs.reserve(init_capacity)
|
||||||
|
this.history.reserve(init_capacity)
|
||||||
|
|
||||||
this._sent = sent
|
this._sent = sent
|
||||||
this._heads = <int*>calloc(length, sizeof(int))
|
|
||||||
if not (this._sent and this._heads):
|
|
||||||
with gil:
|
|
||||||
PyErr_SetFromErrno(MemoryError)
|
|
||||||
PyErr_CheckSignals()
|
|
||||||
this.offset = 0
|
this.offset = 0
|
||||||
this.length = length
|
this.length = length
|
||||||
this._b_i = 0
|
this._b_i = 0
|
||||||
for i in range(length):
|
|
||||||
this._heads[i] = -1
|
|
||||||
this._unshiftable.push_back(0)
|
|
||||||
memset(&this._empty_token, 0, sizeof(TokenC))
|
memset(&this._empty_token, 0, sizeof(TokenC))
|
||||||
this._empty_token.lex = &EMPTY_LEXEME
|
this._empty_token.lex = &EMPTY_LEXEME
|
||||||
|
|
||||||
__dealloc__():
|
|
||||||
free(this._heads)
|
|
||||||
|
|
||||||
void set_context_tokens(int* ids, int n) nogil:
|
void set_context_tokens(int* ids, int n) nogil:
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
if n == 1:
|
if n == 1:
|
||||||
|
@ -132,19 +133,20 @@ cdef cppclass StateC:
|
||||||
ids[i] = -1
|
ids[i] = -1
|
||||||
|
|
||||||
int S(int i) nogil const:
|
int S(int i) nogil const:
|
||||||
if i >= this._stack.size():
|
cdef int stack_size = this._stack.size()
|
||||||
|
if i >= stack_size or i < 0:
|
||||||
return -1
|
return -1
|
||||||
elif i < 0:
|
else:
|
||||||
return -1
|
return this._stack[stack_size - (i+1)]
|
||||||
return this._stack.at(this._stack.size() - (i+1))
|
|
||||||
|
|
||||||
int B(int i) nogil const:
|
int B(int i) nogil const:
|
||||||
|
cdef int buf_size = this._rebuffer.size()
|
||||||
if i < 0:
|
if i < 0:
|
||||||
return -1
|
return -1
|
||||||
elif i < this._rebuffer.size():
|
elif i < buf_size:
|
||||||
return this._rebuffer.at(this._rebuffer.size() - (i+1))
|
return this._rebuffer[buf_size - (i+1)]
|
||||||
else:
|
else:
|
||||||
b_i = this._b_i + (i - this._rebuffer.size())
|
b_i = this._b_i + (i - buf_size)
|
||||||
if b_i >= this.length:
|
if b_i >= this.length:
|
||||||
return -1
|
return -1
|
||||||
else:
|
else:
|
||||||
|
@ -243,7 +245,7 @@ cdef cppclass StateC:
|
||||||
return 0
|
return 0
|
||||||
elif this._sent[word].sent_start == 1:
|
elif this._sent[word].sent_start == 1:
|
||||||
return 1
|
return 1
|
||||||
elif this._sent_starts.count(word) >= 1:
|
elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
@ -328,7 +330,7 @@ cdef cppclass StateC:
|
||||||
if item >= this._unshiftable.size():
|
if item >= this._unshiftable.size():
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return this._unshiftable.at(item)
|
return this._unshiftable[item]
|
||||||
|
|
||||||
void set_reshiftable(int item) nogil:
|
void set_reshiftable(int item) nogil:
|
||||||
if item < this._unshiftable.size():
|
if item < this._unshiftable.size():
|
||||||
|
@ -348,6 +350,9 @@ cdef cppclass StateC:
|
||||||
this._heads[child] = head
|
this._heads[child] = head
|
||||||
|
|
||||||
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
|
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
|
||||||
|
cdef vector[ArcC]* arcs
|
||||||
|
cdef ArcC* arc
|
||||||
|
|
||||||
arcs_it = heads_arcs.find(h_i)
|
arcs_it = heads_arcs.find(h_i)
|
||||||
if arcs_it == heads_arcs.end():
|
if arcs_it == heads_arcs.end():
|
||||||
return
|
return
|
||||||
|
@ -356,12 +361,12 @@ cdef cppclass StateC:
|
||||||
if arcs.size() == 0:
|
if arcs.size() == 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
arc = arcs.back()
|
arc = &arcs.back()
|
||||||
if arc.head == h_i and arc.child == c_i:
|
if arc.head == h_i and arc.child == c_i:
|
||||||
arcs.pop_back()
|
arcs.pop_back()
|
||||||
else:
|
else:
|
||||||
for i in range(arcs.size()-1):
|
for i in range(arcs.size()-1):
|
||||||
arc = arcs.at(i)
|
arc = &deref(arcs)[i]
|
||||||
if arc.head == h_i and arc.child == c_i:
|
if arc.head == h_i and arc.child == c_i:
|
||||||
arc.head = -1
|
arc.head = -1
|
||||||
arc.child = -1
|
arc.child = -1
|
||||||
|
@ -401,7 +406,7 @@ cdef cppclass StateC:
|
||||||
this._rebuffer = src._rebuffer
|
this._rebuffer = src._rebuffer
|
||||||
this._sent_starts = src._sent_starts
|
this._sent_starts = src._sent_starts
|
||||||
this._unshiftable = src._unshiftable
|
this._unshiftable = src._unshiftable
|
||||||
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
|
this._heads = src._heads
|
||||||
this._ents = src._ents
|
this._ents = src._ents
|
||||||
this._left_arcs = src._left_arcs
|
this._left_arcs = src._left_arcs
|
||||||
this._right_arcs = src._right_arcs
|
this._right_arcs = src._right_arcs
|
||||||
|
|
Loading…
Reference in New Issue
Block a user