From 7b06c6ca32c80aa30cb080690ad126c7d5ee4875 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 9 May 2022 19:02:35 +0200 Subject: [PATCH] Parser `StateC` optimizations (#10746) * `StateC`: Optimizations Avoid GIL acquisition in `__init__` Increase default buffer capacities on init Reduce C++ exception overhead * Fix typo * Replace `set::count` with `set::find` * Add exception attribute to c'tor * Remove unused import * Use a power-of-two value for initial capacity Use default-insert to init `_heads` and `_unshiftable` * Merge `cdef` variable declarations and assignments --- spacy/pipeline/_parser_internals/_state.pxd | 57 +++++++++++---------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 10d6ef3c0..bd5d5208c 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -6,7 +6,6 @@ cimport libcpp from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from libcpp.set cimport set -from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from murmurhash.mrmr cimport hash64 from ...vocab cimport EMPTY_LEXEME @@ -26,7 +25,7 @@ cdef struct ArcC: cdef cppclass StateC: - int* _heads + vector[int] _heads const TokenC* _sent vector[int] _stack vector[int] _rebuffer @@ -41,25 +40,27 @@ cdef cppclass StateC: int offset int _b_i - __init__(const TokenC* sent, int length) nogil: + __init__(const TokenC* sent, int length) nogil except +: + this._heads.resize(length, -1) + this._unshiftable.resize(length, False) + + # Reserve memory ahead of time to minimize allocations during parsing. + # The initial capacity set here ideally reflects the expected average-case/majority usage. + cdef int init_capacity = 32 + this._stack.reserve(init_capacity) + this._rebuffer.reserve(init_capacity) + this._ents.reserve(init_capacity) + this._left_arcs.reserve(init_capacity) + this._right_arcs.reserve(init_capacity) + this.history.reserve(init_capacity) + this._sent = sent - this._heads = calloc(length, sizeof(int)) - if not (this._sent and this._heads): - with gil: - PyErr_SetFromErrno(MemoryError) - PyErr_CheckSignals() this.offset = 0 this.length = length this._b_i = 0 - for i in range(length): - this._heads[i] = -1 - this._unshiftable.push_back(0) memset(&this._empty_token, 0, sizeof(TokenC)) this._empty_token.lex = &EMPTY_LEXEME - __dealloc__(): - free(this._heads) - void set_context_tokens(int* ids, int n) nogil: cdef int i, j if n == 1: @@ -132,19 +133,20 @@ cdef cppclass StateC: ids[i] = -1 int S(int i) nogil const: - if i >= this._stack.size(): + cdef int stack_size = this._stack.size() + if i >= stack_size or i < 0: return -1 - elif i < 0: - return -1 - return this._stack.at(this._stack.size() - (i+1)) + else: + return this._stack[stack_size - (i+1)] int B(int i) nogil const: + cdef int buf_size = this._rebuffer.size() if i < 0: return -1 - elif i < this._rebuffer.size(): - return this._rebuffer.at(this._rebuffer.size() - (i+1)) + elif i < buf_size: + return this._rebuffer[buf_size - (i+1)] else: - b_i = this._b_i + (i - this._rebuffer.size()) + b_i = this._b_i + (i - buf_size) if b_i >= this.length: return -1 else: @@ -243,7 +245,7 @@ cdef cppclass StateC: return 0 elif this._sent[word].sent_start == 1: return 1 - elif this._sent_starts.count(word) >= 1: + elif this._sent_starts.const_find(word) != this._sent_starts.const_end(): return 1 else: return 0 @@ -328,7 +330,7 @@ cdef cppclass StateC: if item >= this._unshiftable.size(): return 0 else: - return this._unshiftable.at(item) + return this._unshiftable[item] void set_reshiftable(int item) nogil: if item < this._unshiftable.size(): @@ -348,6 +350,9 @@ cdef cppclass StateC: this._heads[child] = head void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil: + cdef vector[ArcC]* arcs + cdef ArcC* arc + arcs_it = heads_arcs.find(h_i) if arcs_it == heads_arcs.end(): return @@ -356,12 +361,12 @@ cdef cppclass StateC: if arcs.size() == 0: return - arc = arcs.back() + arc = &arcs.back() if arc.head == h_i and arc.child == c_i: arcs.pop_back() else: for i in range(arcs.size()-1): - arc = arcs.at(i) + arc = &deref(arcs)[i] if arc.head == h_i and arc.child == c_i: arc.head = -1 arc.child = -1 @@ -401,7 +406,7 @@ cdef cppclass StateC: this._rebuffer = src._rebuffer this._sent_starts = src._sent_starts this._unshiftable = src._unshiftable - memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0])) + this._heads = src._heads this._ents = src._ents this._left_arcs = src._left_arcs this._right_arcs = src._right_arcs