mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-14 03:26:24 +03:00
bede11b67c
This patch does a few smallish things that tighten up the training workflow a little, and allow memory use during training to be reduced by letting the GoldCorpus stream data properly. Previously, the parser and entity recognizer read and saved labels as lists, with extra labels noted separately. Lists were used becaue ordering is very important, to ensure that the label-to-class mapping is stable. We now manage labels as nested dictionaries, first keyed by the action, and then keyed by the label. Values are frequencies. The trick is, how do we save new labels? We need to make sure we iterate over these in the same order they're added. Otherwise, we'll get different class IDs, and the model's predictions won't make sense. To allow stable sorting, we map the new labels to negative values. If we have two new labels, they'll be noted as having "frequency" -1 and -2. The next new label will then have "frequency" -3. When we sort by (frequency, label), we then get a stable sort. Storing frequencies then allows us to make the next nice improvement. Previously we had to iterate over the whole training set, to pre-process it for the deprojectivisation. This led to storing the whole training set in memory. This was most of the required memory during training. To prevent this, we now store the frequencies as we stream in the data, and deprojectivize as we go. Once we've built the frequencies, we can then apply a frequency cut-off when we decide how many classes to make. Finally, to allow proper data streaming, we also have to have some way of shuffling the iterator. This is awkward if the training files have multiple documents in them. To solve this, the GoldCorpus class now writes the training data to disk in msgpack files, one per document. We can then shuffle the data by shuffling the paths. This is a squash merge, as I made a lot of very small commits. Individual commit messages below. * Simplify label management for TransitionSystem and its subclasses * Fix serialization for new label handling format in parser * Simplify and improve GoldCorpus class. Reduce memory use, write to temp dir * Set actions in transition system * Require thinc 6.11.1.dev4 * Fix error in parser init * Add unicode declaration * Fix unicode declaration * Update textcat test * Try to get model training on less memory * Print json loc for now * Try rapidjson to reduce memory use * Remove rapidjson requirement * Try rapidjson for reduced mem usage * Handle None heads when projectivising * Stream json docs * Fix train script * Handle projectivity in GoldParse * Fix projectivity handling * Add minibatch_by_words util from ud_train * Minibatch by number of words in spacy.cli.train * Move minibatch_by_words util to spacy.util * Fix label handling * More hacking at label management in parser * Fix encoding in msgpack serialization in GoldParse * Adjust batch sizes in parser training * Fix minibatch_by_words * Add merge_subtokens function to pipeline.pyx * Register merge_subtokens factory * Restore use of msgpack tmp directory * Use minibatch-by-words in train * Handle retokenization in scorer * Change back-off approach for missing labels. Use 'dep' label * Update NER for new label management * Set NER tags for over-segmented words * Fix label alignment in gold * Fix label back-off for infrequent labels * Fix int type in labels dict key * Fix int type in labels dict key * Update feature definition for 8 feature set * Update ud-train script for new label stuff * Fix json streamer * Print the line number if conll eval fails * Update children and sentence boundaries after deprojectivisation * Export set_children_from_heads from doc.pxd * Render parses during UD training * Remove print statement * Require thinc 6.11.1.dev6. Try adding wheel as install_requires * Set different dev version, to flush pip cache * Update thinc version * Update GoldCorpus docs * Remove print statements * Fix formatting and links [ci skip]
477 lines
16 KiB
Cython
477 lines
16 KiB
Cython
from libc.string cimport memcpy, memset, memmove
|
|
from libc.stdlib cimport malloc, calloc, free
|
|
from libc.stdint cimport uint32_t, uint64_t
|
|
|
|
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
|
|
|
from murmurhash.mrmr cimport hash64
|
|
|
|
from ..vocab cimport EMPTY_LEXEME
|
|
from ..structs cimport TokenC, Entity
|
|
from ..lexeme cimport Lexeme
|
|
from ..symbols cimport punct
|
|
from ..attrs cimport IS_SPACE
|
|
from ..typedefs cimport attr_t
|
|
|
|
|
|
cdef inline bint is_space_token(const TokenC* token) nogil:
|
|
return Lexeme.c_check_flag(token.lex, IS_SPACE)
|
|
|
|
cdef struct RingBufferC:
|
|
int[8] data
|
|
int i
|
|
int default
|
|
|
|
cdef inline int ring_push(RingBufferC* ring, int value) nogil:
|
|
ring.data[ring.i] = value
|
|
ring.i += 1
|
|
if ring.i >= 8:
|
|
ring.i = 0
|
|
|
|
cdef inline int ring_get(RingBufferC* ring, int i) nogil:
|
|
if i >= ring.i:
|
|
return ring.default
|
|
else:
|
|
return ring.data[ring.i-i]
|
|
|
|
|
|
cdef cppclass StateC:
|
|
int* _stack
|
|
int* _buffer
|
|
bint* shifted
|
|
TokenC* _sent
|
|
Entity* _ents
|
|
TokenC _empty_token
|
|
RingBufferC _hist
|
|
int length
|
|
int offset
|
|
int _s_i
|
|
int _b_i
|
|
int _e_i
|
|
int _break
|
|
|
|
__init__(const TokenC* sent, int length) nogil:
|
|
cdef int PADDING = 5
|
|
this._buffer = <int*>calloc(length + (PADDING * 2), sizeof(int))
|
|
this._stack = <int*>calloc(length + (PADDING * 2), sizeof(int))
|
|
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
|
|
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
|
|
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
|
|
if not (this._buffer and this._stack and this.shifted
|
|
and this._sent and this._ents):
|
|
with gil:
|
|
PyErr_SetFromErrno(MemoryError)
|
|
PyErr_CheckSignals()
|
|
memset(&this._hist, 0, sizeof(this._hist))
|
|
this.offset = 0
|
|
cdef int i
|
|
for i in range(length + (PADDING * 2)):
|
|
this._ents[i].end = -1
|
|
this._sent[i].l_edge = i
|
|
this._sent[i].r_edge = i
|
|
for i in range(PADDING):
|
|
this._sent[i].lex = &EMPTY_LEXEME
|
|
this._sent += PADDING
|
|
this._ents += PADDING
|
|
this._buffer += PADDING
|
|
this._stack += PADDING
|
|
this.shifted += PADDING
|
|
this.length = length
|
|
this._break = -1
|
|
this._s_i = 0
|
|
this._b_i = 0
|
|
this._e_i = 0
|
|
for i in range(length):
|
|
this._buffer[i] = i
|
|
memset(&this._empty_token, 0, sizeof(TokenC))
|
|
this._empty_token.lex = &EMPTY_LEXEME
|
|
for i in range(length):
|
|
this._sent[i] = sent[i]
|
|
this._buffer[i] = i
|
|
for i in range(length, length+PADDING):
|
|
this._sent[i].lex = &EMPTY_LEXEME
|
|
|
|
__dealloc__():
|
|
cdef int PADDING = 5
|
|
free(this._sent - PADDING)
|
|
free(this._ents - PADDING)
|
|
free(this._buffer - PADDING)
|
|
free(this._stack - PADDING)
|
|
free(this.shifted - PADDING)
|
|
|
|
void set_context_tokens(int* ids, int n) nogil:
|
|
if n == 2:
|
|
ids[0] = this.B(0)
|
|
ids[1] = this.S(0)
|
|
if n == 8:
|
|
ids[0] = this.B(0)
|
|
ids[1] = this.B(1)
|
|
ids[2] = this.S(0)
|
|
ids[3] = this.S(1)
|
|
ids[4] = this.S(2)
|
|
ids[5] = this.L(this.B(0), 1)
|
|
ids[6] = this.L(this.S(0), 1)
|
|
ids[7] = this.R(this.S(0), 1)
|
|
elif n == 13:
|
|
ids[0] = this.B(0)
|
|
ids[1] = this.B(1)
|
|
ids[2] = this.S(0)
|
|
ids[3] = this.S(1)
|
|
ids[4] = this.S(2)
|
|
ids[5] = this.L(this.S(0), 1)
|
|
ids[6] = this.L(this.S(0), 2)
|
|
ids[6] = this.R(this.S(0), 1)
|
|
ids[7] = this.L(this.B(0), 1)
|
|
ids[8] = this.R(this.S(0), 2)
|
|
ids[9] = this.L(this.S(1), 1)
|
|
ids[10] = this.L(this.S(1), 2)
|
|
ids[11] = this.R(this.S(1), 1)
|
|
ids[12] = this.R(this.S(1), 2)
|
|
elif n == 6:
|
|
if this.B(0) >= 0:
|
|
ids[0] = this.B(0)
|
|
ids[1] = this.B(0)-1
|
|
else:
|
|
ids[0] = -1
|
|
ids[1] = -1
|
|
ids[2] = this.B(1)
|
|
ids[3] = this.E(0)
|
|
if ids[3] >= 1:
|
|
ids[4] = this.E(0)-1
|
|
else:
|
|
ids[4] = -1
|
|
if (ids[3]+1) < this.length:
|
|
ids[5] = this.E(0)+1
|
|
else:
|
|
ids[5] = -1
|
|
else:
|
|
# TODO error =/
|
|
pass
|
|
for i in range(n):
|
|
if ids[i] >= 0:
|
|
ids[i] += this.offset
|
|
else:
|
|
ids[i] = -1
|
|
|
|
int S(int i) nogil const:
|
|
if i >= this._s_i:
|
|
return -1
|
|
return this._stack[this._s_i - (i+1)]
|
|
|
|
int B(int i) nogil const:
|
|
if (i + this._b_i) >= this.length:
|
|
return -1
|
|
return this._buffer[this._b_i + i]
|
|
|
|
const TokenC* S_(int i) nogil const:
|
|
return this.safe_get(this.S(i))
|
|
|
|
const TokenC* B_(int i) nogil const:
|
|
return this.safe_get(this.B(i))
|
|
|
|
const TokenC* H_(int i) nogil const:
|
|
return this.safe_get(this.H(i))
|
|
|
|
const TokenC* E_(int i) nogil const:
|
|
return this.safe_get(this.E(i))
|
|
|
|
const TokenC* L_(int i, int idx) nogil const:
|
|
return this.safe_get(this.L(i, idx))
|
|
|
|
const TokenC* R_(int i, int idx) nogil const:
|
|
return this.safe_get(this.R(i, idx))
|
|
|
|
const TokenC* safe_get(int i) nogil const:
|
|
if i < 0 or i >= this.length:
|
|
return &this._empty_token
|
|
else:
|
|
return &this._sent[i]
|
|
|
|
int H(int i) nogil const:
|
|
if i < 0 or i >= this.length:
|
|
return -1
|
|
return this._sent[i].head + i
|
|
|
|
int E(int i) nogil const:
|
|
if this._e_i <= 0 or this._e_i >= this.length:
|
|
return -1
|
|
if i < 0 or i >= this._e_i:
|
|
return -1
|
|
return this._ents[this._e_i - (i+1)].start
|
|
|
|
int L(int i, int idx) nogil const:
|
|
if idx < 1:
|
|
return -1
|
|
if i < 0 or i >= this.length:
|
|
return -1
|
|
cdef const TokenC* target = &this._sent[i]
|
|
if target.l_kids < <uint32_t>idx:
|
|
return -1
|
|
cdef const TokenC* ptr = &this._sent[target.l_edge]
|
|
|
|
while ptr < target:
|
|
# If this head is still to the right of us, we can skip to it
|
|
# No token that's between this token and this head could be our
|
|
# child.
|
|
if (ptr.head >= 1) and (ptr + ptr.head) < target:
|
|
ptr += ptr.head
|
|
|
|
elif ptr + ptr.head == target:
|
|
idx -= 1
|
|
if idx == 0:
|
|
return ptr - this._sent
|
|
ptr += 1
|
|
else:
|
|
ptr += 1
|
|
return -1
|
|
|
|
int R(int i, int idx) nogil const:
|
|
if idx < 1:
|
|
return -1
|
|
if i < 0 or i >= this.length:
|
|
return -1
|
|
cdef const TokenC* target = &this._sent[i]
|
|
if target.r_kids < <uint32_t>idx:
|
|
return -1
|
|
cdef const TokenC* ptr = &this._sent[target.r_edge]
|
|
while ptr > target:
|
|
# If this head is still to the right of us, we can skip to it
|
|
# No token that's between this token and this head could be our
|
|
# child.
|
|
if (ptr.head < 0) and ((ptr + ptr.head) > target):
|
|
ptr += ptr.head
|
|
elif ptr + ptr.head == target:
|
|
idx -= 1
|
|
if idx == 0:
|
|
return ptr - this._sent
|
|
ptr -= 1
|
|
else:
|
|
ptr -= 1
|
|
return -1
|
|
|
|
bint empty() nogil const:
|
|
return this._s_i <= 0
|
|
|
|
bint eol() nogil const:
|
|
return this.buffer_length() == 0
|
|
|
|
bint at_break() nogil const:
|
|
return this._break != -1
|
|
|
|
bint is_final() nogil const:
|
|
return this.stack_depth() <= 0 and this._b_i >= this.length
|
|
|
|
bint has_head(int i) nogil const:
|
|
return this.safe_get(i).head != 0
|
|
|
|
int n_L(int i) nogil const:
|
|
return this.safe_get(i).l_kids
|
|
|
|
int n_R(int i) nogil const:
|
|
return this.safe_get(i).r_kids
|
|
|
|
bint stack_is_connected() nogil const:
|
|
return False
|
|
|
|
bint entity_is_open() nogil const:
|
|
if this._e_i < 1:
|
|
return False
|
|
return this._ents[this._e_i-1].end == -1
|
|
|
|
int stack_depth() nogil const:
|
|
return this._s_i
|
|
|
|
int buffer_length() nogil const:
|
|
if this._break != -1:
|
|
return this._break - this._b_i
|
|
else:
|
|
return this.length - this._b_i
|
|
|
|
uint64_t hash() nogil const:
|
|
cdef TokenC[11] sig
|
|
sig[0] = this.S_(2)[0]
|
|
sig[1] = this.S_(1)[0]
|
|
sig[2] = this.R_(this.S(1), 1)[0]
|
|
sig[3] = this.L_(this.S(0), 1)[0]
|
|
sig[4] = this.L_(this.S(0), 2)[0]
|
|
sig[5] = this.S_(0)[0]
|
|
sig[6] = this.R_(this.S(0), 2)[0]
|
|
sig[7] = this.R_(this.S(0), 1)[0]
|
|
sig[8] = this.B_(0)[0]
|
|
sig[9] = this.E_(0)[0]
|
|
sig[10] = this.E_(1)[0]
|
|
return hash64(sig, sizeof(sig), this._s_i) \
|
|
+ hash64(<void*>&this._hist, sizeof(RingBufferC), 1)
|
|
|
|
void push_hist(int act) nogil:
|
|
ring_push(&this._hist, act+1)
|
|
|
|
int get_hist(int i) nogil:
|
|
return ring_get(&this._hist, i)
|
|
|
|
void push() nogil:
|
|
if this.B(0) != -1:
|
|
this._stack[this._s_i] = this.B(0)
|
|
this._s_i += 1
|
|
this._b_i += 1
|
|
if this.B_(0).sent_start == 1:
|
|
this.set_break(this.B(0))
|
|
if this._b_i > this._break:
|
|
this._break = -1
|
|
|
|
void pop() nogil:
|
|
if this._s_i >= 1:
|
|
this._s_i -= 1
|
|
|
|
void unshift() nogil:
|
|
this._b_i -= 1
|
|
this._buffer[this._b_i] = this.S(0)
|
|
this._s_i -= 1
|
|
this.shifted[this.B(0)] = True
|
|
|
|
void add_arc(int head, int child, attr_t label) nogil:
|
|
if this.has_head(child):
|
|
this.del_arc(this.H(child), child)
|
|
|
|
cdef int dist = head - child
|
|
this._sent[child].head = dist
|
|
this._sent[child].dep = label
|
|
cdef int i
|
|
if child > head:
|
|
this._sent[head].r_kids += 1
|
|
# Some transition systems can have a word in the buffer have a
|
|
# rightward child, e.g. from Unshift.
|
|
this._sent[head].r_edge = this._sent[child].r_edge
|
|
i = 0
|
|
while this.has_head(head) and i < this.length:
|
|
head = this.H(head)
|
|
this._sent[head].r_edge = this._sent[child].r_edge
|
|
i += 1 # Guard against infinite loops
|
|
else:
|
|
this._sent[head].l_kids += 1
|
|
this._sent[head].l_edge = this._sent[child].l_edge
|
|
|
|
void del_arc(int h_i, int c_i) nogil:
|
|
cdef int dist = h_i - c_i
|
|
cdef TokenC* h = &this._sent[h_i]
|
|
cdef int i = 0
|
|
if c_i > h_i:
|
|
# this.R_(h_i, 2) returns the second-rightmost child token of h_i
|
|
# If we have more than 2 rightmost children, our 2nd rightmost child's
|
|
# rightmost edge is going to be our new rightmost edge.
|
|
h.r_edge = this.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
|
|
h.r_kids -= 1
|
|
new_edge = h.r_edge
|
|
# Correct upwards in the tree --- see Issue #251
|
|
while h.head < 0 and i < this.length: # Guard infinite loop
|
|
h += h.head
|
|
h.r_edge = new_edge
|
|
i += 1
|
|
else:
|
|
# Same logic applies for left edge, but we don't need to walk up
|
|
# the tree, as the head is off the stack.
|
|
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
|
|
h.l_kids -= 1
|
|
|
|
void open_ent(attr_t label) nogil:
|
|
this._ents[this._e_i].start = this.B(0)
|
|
this._ents[this._e_i].label = label
|
|
this._ents[this._e_i].end = -1
|
|
this._e_i += 1
|
|
|
|
void close_ent() nogil:
|
|
# Note that we don't decrement _e_i here! We want to maintain all
|
|
# entities, not over-write them...
|
|
this._ents[this._e_i-1].end = this.B(0)+1
|
|
this._sent[this.B(0)].ent_iob = 1
|
|
|
|
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
|
|
if 0 <= i < this.length:
|
|
this._sent[i].ent_iob = ent_iob
|
|
this._sent[i].ent_type = ent_type
|
|
|
|
void set_break(int i) nogil:
|
|
if 0 <= i < this.length:
|
|
this._sent[i].sent_start = 1
|
|
this._break = this._b_i
|
|
|
|
void clone(const StateC* src) nogil:
|
|
this.length = src.length
|
|
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
|
|
memcpy(this._stack, src._stack, this.length * sizeof(int))
|
|
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
|
|
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
|
|
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
|
|
this._b_i = src._b_i
|
|
this._s_i = src._s_i
|
|
this._e_i = src._e_i
|
|
this._break = src._break
|
|
this.offset = src.offset
|
|
this._empty_token = src._empty_token
|
|
|
|
void fast_forward() nogil:
|
|
# space token attachement policy:
|
|
# - attach space tokens always to the last preceding real token
|
|
# - except if it's the beginning of a sentence, then attach to the first following
|
|
# - boundary case: a document containing multiple space tokens but nothing else,
|
|
# then make the last space token the head of all others
|
|
|
|
while is_space_token(this.B_(0)) \
|
|
or this.buffer_length() == 0 \
|
|
or this.stack_depth() == 0:
|
|
if this.buffer_length() == 0:
|
|
# remove the last sentence's root from the stack
|
|
if this.stack_depth() == 1:
|
|
this.pop()
|
|
# parser got stuck: reduce stack or unshift
|
|
elif this.stack_depth() > 1:
|
|
if this.has_head(this.S(0)):
|
|
this.pop()
|
|
else:
|
|
this.unshift()
|
|
# stack is empty but there is another sentence on the buffer
|
|
elif (this.length - this._b_i) >= 1:
|
|
this.push()
|
|
else: # stack empty and nothing else coming
|
|
break
|
|
|
|
elif is_space_token(this.B_(0)):
|
|
# the normal case: we're somewhere inside a sentence
|
|
if this.stack_depth() > 0:
|
|
# assert not is_space_token(this.S_(0))
|
|
# attach all coming space tokens to their last preceding
|
|
# real token (which should be on the top of the stack)
|
|
while is_space_token(this.B_(0)):
|
|
this.add_arc(this.S(0),this.B(0),0)
|
|
this.push()
|
|
this.pop()
|
|
# the rare case: we're at the beginning of a document:
|
|
# space tokens are attached to the first real token on the buffer
|
|
elif this.stack_depth() == 0:
|
|
# store all space tokens on the stack until a real token shows up
|
|
# or the last token on the buffer is reached
|
|
while is_space_token(this.B_(0)) and this.buffer_length() > 1:
|
|
this.push()
|
|
# empty the stack by attaching all space tokens to the
|
|
# first token on the buffer
|
|
# boundary case: if all tokens are space tokens, the last one
|
|
# becomes the head of all others
|
|
while this.stack_depth() > 0:
|
|
this.add_arc(this.B(0),this.S(0),0)
|
|
this.pop()
|
|
# move the first token onto the stack
|
|
this.push()
|
|
|
|
elif this.stack_depth() == 0:
|
|
# for one token sentences (?)
|
|
if this.buffer_length() == 1:
|
|
this.push()
|
|
this.pop()
|
|
# with an empty stack and a non-empty buffer
|
|
# only shift is valid anyway
|
|
elif (this.length - this._b_i) >= 1:
|
|
this.push()
|
|
|
|
else: # can this even happen?
|
|
break
|