Parser StateC optimizations (#10746)

* `StateC`: Optimizations

Avoid GIL acquisition in `__init__`
Increase default buffer capacities on init
Reduce C++ exception overhead

* Fix typo

* Replace `set::count` with `set::find`

* Add exception attribute (`except +`) to the constructor

* Remove unused import

* Use a power-of-two value for initial capacity
Use default-insert to init `_heads` and `_unshiftable`

* Merge `cdef` variable declarations and assignments
This commit is contained in:
Madeesh Kannan 2022-05-09 19:02:35 +02:00 committed by GitHub
parent e3a5350540
commit 7b06c6ca32
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,7 +6,6 @@ cimport libcpp
from libcpp.unordered_map cimport unordered_map from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libcpp.set cimport set from libcpp.set cimport set
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from ...vocab cimport EMPTY_LEXEME from ...vocab cimport EMPTY_LEXEME
@ -26,7 +25,7 @@ cdef struct ArcC:
cdef cppclass StateC: cdef cppclass StateC:
int* _heads vector[int] _heads
const TokenC* _sent const TokenC* _sent
vector[int] _stack vector[int] _stack
vector[int] _rebuffer vector[int] _rebuffer
@ -41,25 +40,27 @@ cdef cppclass StateC:
int offset int offset
int _b_i int _b_i
__init__(const TokenC* sent, int length) nogil: __init__(const TokenC* sent, int length) nogil except +:
this._heads.resize(length, -1)
this._unshiftable.resize(length, False)
# Reserve memory ahead of time to minimize allocations during parsing.
# The initial capacity set here ideally reflects the expected average-case/majority usage.
cdef int init_capacity = 32
this._stack.reserve(init_capacity)
this._rebuffer.reserve(init_capacity)
this._ents.reserve(init_capacity)
this._left_arcs.reserve(init_capacity)
this._right_arcs.reserve(init_capacity)
this.history.reserve(init_capacity)
this._sent = sent this._sent = sent
this._heads = <int*>calloc(length, sizeof(int))
if not (this._sent and this._heads):
with gil:
PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals()
this.offset = 0 this.offset = 0
this.length = length this.length = length
this._b_i = 0 this._b_i = 0
for i in range(length):
this._heads[i] = -1
this._unshiftable.push_back(0)
memset(&this._empty_token, 0, sizeof(TokenC)) memset(&this._empty_token, 0, sizeof(TokenC))
this._empty_token.lex = &EMPTY_LEXEME this._empty_token.lex = &EMPTY_LEXEME
__dealloc__():
free(this._heads)
void set_context_tokens(int* ids, int n) nogil: void set_context_tokens(int* ids, int n) nogil:
cdef int i, j cdef int i, j
if n == 1: if n == 1:
@ -132,19 +133,20 @@ cdef cppclass StateC:
ids[i] = -1 ids[i] = -1
int S(int i) nogil const: int S(int i) nogil const:
if i >= this._stack.size(): cdef int stack_size = this._stack.size()
if i >= stack_size or i < 0:
return -1 return -1
elif i < 0: else:
return -1 return this._stack[stack_size - (i+1)]
return this._stack.at(this._stack.size() - (i+1))
int B(int i) nogil const: int B(int i) nogil const:
cdef int buf_size = this._rebuffer.size()
if i < 0: if i < 0:
return -1 return -1
elif i < this._rebuffer.size(): elif i < buf_size:
return this._rebuffer.at(this._rebuffer.size() - (i+1)) return this._rebuffer[buf_size - (i+1)]
else: else:
b_i = this._b_i + (i - this._rebuffer.size()) b_i = this._b_i + (i - buf_size)
if b_i >= this.length: if b_i >= this.length:
return -1 return -1
else: else:
@ -243,7 +245,7 @@ cdef cppclass StateC:
return 0 return 0
elif this._sent[word].sent_start == 1: elif this._sent[word].sent_start == 1:
return 1 return 1
elif this._sent_starts.count(word) >= 1: elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
return 1 return 1
else: else:
return 0 return 0
@ -328,7 +330,7 @@ cdef cppclass StateC:
if item >= this._unshiftable.size(): if item >= this._unshiftable.size():
return 0 return 0
else: else:
return this._unshiftable.at(item) return this._unshiftable[item]
void set_reshiftable(int item) nogil: void set_reshiftable(int item) nogil:
if item < this._unshiftable.size(): if item < this._unshiftable.size():
@ -348,6 +350,9 @@ cdef cppclass StateC:
this._heads[child] = head this._heads[child] = head
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil: void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
cdef vector[ArcC]* arcs
cdef ArcC* arc
arcs_it = heads_arcs.find(h_i) arcs_it = heads_arcs.find(h_i)
if arcs_it == heads_arcs.end(): if arcs_it == heads_arcs.end():
return return
@ -356,12 +361,12 @@ cdef cppclass StateC:
if arcs.size() == 0: if arcs.size() == 0:
return return
arc = arcs.back() arc = &arcs.back()
if arc.head == h_i and arc.child == c_i: if arc.head == h_i and arc.child == c_i:
arcs.pop_back() arcs.pop_back()
else: else:
for i in range(arcs.size()-1): for i in range(arcs.size()-1):
arc = arcs.at(i) arc = &deref(arcs)[i]
if arc.head == h_i and arc.child == c_i: if arc.head == h_i and arc.child == c_i:
arc.head = -1 arc.head = -1
arc.child = -1 arc.child = -1
@ -401,7 +406,7 @@ cdef cppclass StateC:
this._rebuffer = src._rebuffer this._rebuffer = src._rebuffer
this._sent_starts = src._sent_starts this._sent_starts = src._sent_starts
this._unshiftable = src._unshiftable this._unshiftable = src._unshiftable
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0])) this._heads = src._heads
this._ents = src._ents this._ents = src._ents
this._left_arcs = src._left_arcs this._left_arcs = src._left_arcs
this._right_arcs = src._right_arcs this._right_arcs = src._right_arcs