mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-10 15:14:56 +03:00
WIP on adding split-token actions to parser
This patch starts getting the StateC object ready to split tokens. The split function is implemented by pushing indices into the buffer that indicate an out-of-length token. Still to do: (1) update the oracles; (2) update GoldParseC; (3) interpret the parse once it's complete; (4) add a retokenizer.split() method.
This commit is contained in:
parent
3e3af01681
commit
e5ad35787c
|
@ -13,7 +13,6 @@ from ..symbols cimport punct
|
||||||
from ..attrs cimport IS_SPACE
|
from ..attrs cimport IS_SPACE
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
cdef void _split(StateC* this, int i, int n) nogil
|
|
||||||
|
|
||||||
cdef inline bint is_space_token(const TokenC* token) nogil:
|
cdef inline bint is_space_token(const TokenC* token) nogil:
|
||||||
return Lexeme.c_check_flag(token.lex, IS_SPACE)
|
return Lexeme.c_check_flag(token.lex, IS_SPACE)
|
||||||
|
@ -44,12 +43,14 @@ cdef cppclass StateC:
|
||||||
Entity* _ents
|
Entity* _ents
|
||||||
TokenC _empty_token
|
TokenC _empty_token
|
||||||
RingBufferC _hist
|
RingBufferC _hist
|
||||||
|
int buffer_length
|
||||||
|
int max_split
|
||||||
int length
|
int length
|
||||||
int offset
|
int offset
|
||||||
int _s_i
|
int _s_i
|
||||||
int _b_i
|
int _b_i
|
||||||
int _e_i
|
int _e_i
|
||||||
int _break
|
int _n_until_break
|
||||||
|
|
||||||
__init__(const TokenC* sent, int length) nogil:
|
__init__(const TokenC* sent, int length) nogil:
|
||||||
cdef int PADDING = 5
|
cdef int PADDING = 5
|
||||||
|
@ -78,7 +79,9 @@ cdef cppclass StateC:
|
||||||
this._stack += PADDING
|
this._stack += PADDING
|
||||||
this.shifted += PADDING
|
this.shifted += PADDING
|
||||||
this.length = length
|
this.length = length
|
||||||
this._break = -1
|
this.buffer_length = length
|
||||||
|
this.max_split = 0
|
||||||
|
this._n_until_break = -1
|
||||||
this._s_i = 0
|
this._s_i = 0
|
||||||
this._b_i = 0
|
this._b_i = 0
|
||||||
this._e_i = 0
|
this._e_i = 0
|
||||||
|
@ -160,7 +163,9 @@ cdef cppclass StateC:
|
||||||
return this._stack[this._s_i - (i+1)]
|
return this._stack[this._s_i - (i+1)]
|
||||||
|
|
||||||
int B(int i) nogil const:
|
int B(int i) nogil const:
|
||||||
if (i + this._b_i) >= this.length:
|
if i >= this.buffer_length:
|
||||||
|
return -1
|
||||||
|
if this._n_until_break != -1 and i >= this._n_until_break:
|
||||||
return -1
|
return -1
|
||||||
return this._buffer[this._b_i + i]
|
return this._buffer[this._b_i + i]
|
||||||
|
|
||||||
|
@ -254,13 +259,13 @@ cdef cppclass StateC:
|
||||||
return this._s_i <= 0
|
return this._s_i <= 0
|
||||||
|
|
||||||
bint eol() nogil const:
|
bint eol() nogil const:
|
||||||
return this.buffer_length() == 0
|
return this.buffer_length == 0 or this.at_break()
|
||||||
|
|
||||||
bint at_break() nogil const:
|
bint at_break() nogil const:
|
||||||
return this._break != -1
|
return this._n_until_break == 0
|
||||||
|
|
||||||
bint is_final() nogil const:
|
bint is_final() nogil const:
|
||||||
return this.stack_depth() <= 0 and this._b_i >= this.length
|
return this.stack_depth() <= 0 and this.buffer_length == 0
|
||||||
|
|
||||||
bint has_head(int i) nogil const:
|
bint has_head(int i) nogil const:
|
||||||
return this.safe_get(i).head != 0
|
return this.safe_get(i).head != 0
|
||||||
|
@ -282,12 +287,6 @@ cdef cppclass StateC:
|
||||||
int stack_depth() nogil const:
|
int stack_depth() nogil const:
|
||||||
return this._s_i
|
return this._s_i
|
||||||
|
|
||||||
int buffer_length() nogil const:
|
|
||||||
if this._break != -1:
|
|
||||||
return this._break - this._b_i
|
|
||||||
else:
|
|
||||||
return this.length - this._b_i
|
|
||||||
|
|
||||||
uint64_t hash() nogil const:
|
uint64_t hash() nogil const:
|
||||||
cdef TokenC[11] sig
|
cdef TokenC[11] sig
|
||||||
sig[0] = this.S_(2)[0]
|
sig[0] = this.S_(2)[0]
|
||||||
|
@ -311,46 +310,62 @@ cdef cppclass StateC:
|
||||||
return ring_get(&this._hist, i)
|
return ring_get(&this._hist, i)
|
||||||
|
|
||||||
void push() nogil:
|
void push() nogil:
|
||||||
if this.B(0) != -1:
|
if this.buffer_length != 0:
|
||||||
this._stack[this._s_i] = this.B(0)
|
this._stack[this._s_i] = this._buffer[this._b_i]
|
||||||
|
if this._n_until_break != -1:
|
||||||
|
this._n_until_break -= 1
|
||||||
this._s_i += 1
|
this._s_i += 1
|
||||||
this._b_i += 1
|
this._b_i += 1
|
||||||
|
this.buffer_length -= 1
|
||||||
if this.B_(0).sent_start == 1:
|
if this.B_(0).sent_start == 1:
|
||||||
this.set_break(this.B(0))
|
this.set_break(0)
|
||||||
if this._b_i > this._break:
|
|
||||||
this._break = -1
|
|
||||||
|
|
||||||
void split(int i, int n) nogil:
|
void split(int i, int n) nogil:
|
||||||
'''Split token i of the buffer into N pieces.'''
|
'''Split token i of the buffer into N pieces.'''
|
||||||
# Let's say we've got a length 10 sentence.
|
# Let's say we've got a length 10 sentence. 4 is start of buffer.
|
||||||
# state.split(5, 2)
|
# We do: state.split(1, 2)
|
||||||
# Before: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
#
|
||||||
# After: [0, 1, 2, 3, 4, 5.0, 5.1, 5.2, 6, 7, 8, 9, 10]
|
# Old buffer: 4,5,6,7,8,9
|
||||||
# Sentence grows to length 12.
|
# New buffer: 4,5,13,22,6,7,8,9
|
||||||
# Words 6-10 move to positions 8-12
|
if (this._b_i+5*2) < n:
|
||||||
# Words 0-5 stay where they are.
|
with gil:
|
||||||
cdef int PADDING = 5
|
raise NotImplementedError
|
||||||
cdef int j
|
# Let's say we're at token index 4. this._b_i will be 4, so that we
|
||||||
# Unwind the padding, so we can work with the original pointer.
|
# point forward into the buffer. To insert, we don't need to reallocate
|
||||||
this._sent -= PADDING
|
# -- we have space at the start; we can just shift the tokens between
|
||||||
this._sent = <TokenC*>realloc(this._sent,
|
# where we are at the buffer and where the split starts backwards to
|
||||||
((this.length+n+1) + (PADDING * 2)) * sizeof(TokenC))
|
# make room.
|
||||||
for j in range(this.length+PADDING*2, this.length+n+1+PADDING*2):
|
#
|
||||||
this._sent[j] = this._empty_token
|
# For b_i=4, i=1, n=2 we want to have:
|
||||||
# Put the start padding back in
|
# Old buffer: [_, _, _, _, 4, 5, 6, 7, 8, 9] and b_i=4
|
||||||
this._sent += PADDING
|
# New buffer: [_, _, 4, 5, 13, 22, 6, 7, 8, 9] and b_i=2
|
||||||
# In our example, we want to move words 6-10 to 8-12. So we must move
|
# b_i will always move back by n in total, as that's
|
||||||
# a block of 4 words.
|
# the size of the gap we're creating.
|
||||||
cdef int n_moved = this.length - (i+1)
|
# The number of values we have to copy will be i+1
|
||||||
cdef int move_from = i+1
|
# Another way to see it:
|
||||||
cdef int move_to = i+n+1
|
# For b_i=4, i=1, n=2
|
||||||
memmove(&this._sent[move_to], &this._sent[move_from],
|
# buffer[2:4] = buffer[4:6]
|
||||||
n_moved*sizeof(TokenC))
|
# buffer[4:6] = new_tokens
|
||||||
# Now copy the token that has been split into its neighbours.
|
# For b_i=7, i=1, n=1
|
||||||
for j in range(i+1, i+n+1):
|
# buffer[6:8] = buffer[7:9]
|
||||||
this._sent[j] = this._sent[i]
|
# buffer[8:9] = new_tokens
|
||||||
# Finally, adjust length.
|
# For b_i=3, i=1, n=3
|
||||||
this.length += n
|
# buffer[0:2] = buffer[3:5]
|
||||||
|
# buffer[2:5] = new_tokens
|
||||||
|
# For b_i=5, i=3, n=1
|
||||||
|
# buffer[4:8] = buffer[5:9]
|
||||||
|
# buffer[8:9] = new_tokens
|
||||||
|
cdef int target = this.B(i)
|
||||||
|
this._b_i -= n
|
||||||
|
memmove(&this._buffer[this._b_i],
|
||||||
|
&this._buffer[this._b_i+n], (i+1)*sizeof(this._buffer[0]))
|
||||||
|
cdef int subtoken, new_token
|
||||||
|
for subtoken in range(n):
|
||||||
|
new_token = (subtoken+1) * this.length + target
|
||||||
|
this._buffer[this._b_i+(i+1)+subtoken] = new_token
|
||||||
|
this.buffer_length += n
|
||||||
|
if this._n_until_break != -1:
|
||||||
|
this._n_until_break += n
|
||||||
|
|
||||||
void pop() nogil:
|
void pop() nogil:
|
||||||
if this._s_i >= 1:
|
if this._s_i >= 1:
|
||||||
|
@ -361,6 +376,9 @@ cdef cppclass StateC:
|
||||||
this._buffer[this._b_i] = this.S(0)
|
this._buffer[this._b_i] = this.S(0)
|
||||||
this._s_i -= 1
|
this._s_i -= 1
|
||||||
this.shifted[this.B(0)] = True
|
this.shifted[this.B(0)] = True
|
||||||
|
this.buffer_length += 1
|
||||||
|
if this._n_until_break != -1:
|
||||||
|
this._n_until_break += 1
|
||||||
|
|
||||||
void add_arc(int head, int child, attr_t label) nogil:
|
void add_arc(int head, int child, attr_t label) nogil:
|
||||||
if this.has_head(child):
|
if this.has_head(child):
|
||||||
|
@ -424,12 +442,13 @@ cdef cppclass StateC:
|
||||||
this._sent[i].ent_type = ent_type
|
this._sent[i].ent_type = ent_type
|
||||||
|
|
||||||
void set_break(int i) nogil:
|
void set_break(int i) nogil:
|
||||||
if 0 <= i < this.length:
|
if 0 <= i < this.buffer_length:
|
||||||
this._sent[i].sent_start = 1
|
this._sent[this.B_(i).l_edge].sent_start = 1
|
||||||
this._break = this._b_i
|
this._n_until_break = i
|
||||||
|
|
||||||
void clone(const StateC* src) nogil:
|
void clone(const StateC* src) nogil:
|
||||||
this.length = src.length
|
this.length = src.length
|
||||||
|
this.buffer_length = src.buffer_length
|
||||||
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
|
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
|
||||||
memcpy(this._stack, src._stack, this.length * sizeof(int))
|
memcpy(this._stack, src._stack, this.length * sizeof(int))
|
||||||
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
|
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
|
||||||
|
@ -438,7 +457,7 @@ cdef cppclass StateC:
|
||||||
this._b_i = src._b_i
|
this._b_i = src._b_i
|
||||||
this._s_i = src._s_i
|
this._s_i = src._s_i
|
||||||
this._e_i = src._e_i
|
this._e_i = src._e_i
|
||||||
this._break = src._break
|
this._n_until_break = src._n_until_break
|
||||||
this.offset = src.offset
|
this.offset = src.offset
|
||||||
this._empty_token = src._empty_token
|
this._empty_token = src._empty_token
|
||||||
|
|
||||||
|
@ -450,9 +469,9 @@ cdef cppclass StateC:
|
||||||
# then make the last space token the head of all others
|
# then make the last space token the head of all others
|
||||||
|
|
||||||
while is_space_token(this.B_(0)) \
|
while is_space_token(this.B_(0)) \
|
||||||
or this.buffer_length() == 0 \
|
or this.eol() \
|
||||||
or this.stack_depth() == 0:
|
or this.stack_depth() == 0:
|
||||||
if this.buffer_length() == 0:
|
if this.eol():
|
||||||
# remove the last sentence's root from the stack
|
# remove the last sentence's root from the stack
|
||||||
if this.stack_depth() == 1:
|
if this.stack_depth() == 1:
|
||||||
this.pop()
|
this.pop()
|
||||||
|
@ -463,7 +482,7 @@ cdef cppclass StateC:
|
||||||
else:
|
else:
|
||||||
this.unshift()
|
this.unshift()
|
||||||
# stack is empty but there is another sentence on the buffer
|
# stack is empty but there is another sentence on the buffer
|
||||||
elif (this.length - this._b_i) >= 1:
|
elif this.buffer_length != 0:
|
||||||
this.push()
|
this.push()
|
||||||
else: # stack empty and nothing else coming
|
else: # stack empty and nothing else coming
|
||||||
break
|
break
|
||||||
|
@ -483,7 +502,7 @@ cdef cppclass StateC:
|
||||||
elif this.stack_depth() == 0:
|
elif this.stack_depth() == 0:
|
||||||
# store all space tokens on the stack until a real token shows up
|
# store all space tokens on the stack until a real token shows up
|
||||||
# or the last token on the buffer is reached
|
# or the last token on the buffer is reached
|
||||||
while is_space_token(this.B_(0)) and this.buffer_length() > 1:
|
while is_space_token(this.B_(0)) and this.buffer_length > 1:
|
||||||
this.push()
|
this.push()
|
||||||
# empty the stack by attaching all space tokens to the
|
# empty the stack by attaching all space tokens to the
|
||||||
# first token on the buffer
|
# first token on the buffer
|
||||||
|
@ -497,12 +516,12 @@ cdef cppclass StateC:
|
||||||
|
|
||||||
elif this.stack_depth() == 0:
|
elif this.stack_depth() == 0:
|
||||||
# for one token sentences (?)
|
# for one token sentences (?)
|
||||||
if this.buffer_length() == 1:
|
if this.buffer_length == 1:
|
||||||
this.push()
|
this.push()
|
||||||
this.pop()
|
this.pop()
|
||||||
# with an empty stack and a non-empty buffer
|
# with an empty stack and a non-empty buffer
|
||||||
# only shift is valid anyway
|
# only shift is valid anyway
|
||||||
elif (this.length - this._b_i) >= 1:
|
elif this.buffer_length != 0:
|
||||||
this.push()
|
this.push()
|
||||||
|
|
||||||
else: # can this even happen?
|
else: # can this even happen?
|
||||||
|
|
|
@ -66,7 +66,7 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
|
||||||
cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
|
cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
|
||||||
cdef weight_t cost = 0
|
cdef weight_t cost = 0
|
||||||
cdef int i, B_i
|
cdef int i, B_i
|
||||||
for i in range(stcls.buffer_length()):
|
for i in range(stcls.c.buffer_length):
|
||||||
B_i = stcls.B(i)
|
B_i = stcls.B(i)
|
||||||
cost += gold.heads[B_i] == target
|
cost += gold.heads[B_i] == target
|
||||||
cost += gold.heads[target] == B_i
|
cost += gold.heads[target] == B_i
|
||||||
|
@ -118,7 +118,7 @@ cdef class Shift:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
sent_start = st._sent[st.B_(0).l_edge].sent_start
|
sent_start = st._sent[st.B_(0).l_edge].sent_start
|
||||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and sent_start != 1
|
return st.buffer_length >= 2 and not st.shifted[st.B(0)] and sent_start != 1
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
@ -137,10 +137,11 @@ cdef class Shift:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
if gold.fused_tokens[s.B(1)] == label:
|
return 0
|
||||||
return 0
|
#if gold.fused_tokens[s.B(1)] == label:
|
||||||
else:
|
# return 0
|
||||||
return 1
|
#else:
|
||||||
|
# return 1
|
||||||
|
|
||||||
|
|
||||||
cdef class Reduce:
|
cdef class Reduce:
|
||||||
|
@ -265,7 +266,7 @@ cdef class Break:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.set_break(st.B_(0).l_edge)
|
st.set_break(0)
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -278,7 +279,7 @@ cdef class Break:
|
||||||
cdef int i, j, S_i, B_i
|
cdef int i, j, S_i, B_i
|
||||||
for i in range(s.stack_depth()):
|
for i in range(s.stack_depth()):
|
||||||
S_i = s.S(i)
|
S_i = s.S(i)
|
||||||
for j in range(s.buffer_length()):
|
for j in range(s.c.buffer_length):
|
||||||
B_i = s.B(j)
|
B_i = s.B(j)
|
||||||
cost += gold.heads[S_i] == B_i
|
cost += gold.heads[S_i] == B_i
|
||||||
cost += gold.heads[B_i] == S_i
|
cost += gold.heads[B_i] == S_i
|
||||||
|
|
|
@ -10,6 +10,7 @@ from ..vocab cimport EMPTY_LEXEME
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
|
||||||
|
|
||||||
|
@cython.final
|
||||||
cdef class StateClass:
|
cdef class StateClass:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef StateC* c
|
cdef StateC* c
|
||||||
|
@ -105,7 +106,7 @@ cdef class StateClass:
|
||||||
return self.c.stack_depth()
|
return self.c.stack_depth()
|
||||||
|
|
||||||
cdef inline int buffer_length(self) nogil:
|
cdef inline int buffer_length(self) nogil:
|
||||||
return self.c.buffer_length()
|
return self.c.buffer_length
|
||||||
|
|
||||||
cdef inline void push(self) nogil:
|
cdef inline void push(self) nogil:
|
||||||
self.c.push()
|
self.c.push()
|
||||||
|
|
|
@ -8,13 +8,14 @@ from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
||||||
cdef class StateClass:
|
cdef class StateClass:
|
||||||
def __init__(self, Doc doc=None, int offset=0):
|
def __init__(self, Doc doc=None, int offset=0, int max_split=0):
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
self.mem = mem
|
self.mem = mem
|
||||||
self._borrowed = 0
|
self._borrowed = 0
|
||||||
if doc is not None:
|
if doc is not None:
|
||||||
self.c = new StateC(doc.c, doc.length)
|
self.c = new StateC(doc.c, doc.length)
|
||||||
self.c.offset = offset
|
self.c.offset = offset
|
||||||
|
self.c.max_split = max_split
|
||||||
|
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
if self._borrowed != 1:
|
if self._borrowed != 1:
|
||||||
|
@ -39,6 +40,14 @@ cdef class StateClass:
|
||||||
if fast_forward:
|
if fast_forward:
|
||||||
self.c.fast_forward()
|
self.c.fast_forward()
|
||||||
|
|
||||||
|
def unshift(self, fast_forward=True):
|
||||||
|
self.c.unshift()
|
||||||
|
if fast_forward:
|
||||||
|
self.c.fast_forward()
|
||||||
|
|
||||||
|
def set_break(self, int i):
|
||||||
|
self.c.set_break(i)
|
||||||
|
|
||||||
def split_token(self, int i, int n, fast_forward=True):
|
def split_token(self, int i, int n, fast_forward=True):
|
||||||
self.c.split(i, n)
|
self.c.split(i, n)
|
||||||
if fast_forward:
|
if fast_forward:
|
||||||
|
@ -57,7 +66,7 @@ cdef class StateClass:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def queue(self):
|
def queue(self):
|
||||||
return {self.B(i) for i in range(self.c.buffer_length())}
|
return [self.B(i) for i in range(self.c.buffer_length)]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def token_vector_lenth(self):
|
def token_vector_lenth(self):
|
||||||
|
|
|
@ -32,37 +32,12 @@ def test_pop():
|
||||||
assert state.get_S(0) == 0
|
assert state.get_S(0) == 0
|
||||||
|
|
||||||
|
|
||||||
def toy_split():
|
|
||||||
def _realloc(data, new_size):
|
|
||||||
additions = new_size - len(data)
|
|
||||||
return data + ['']*additions
|
|
||||||
length = 10
|
|
||||||
sent = list(range(length))
|
|
||||||
sent = [None]*pad + sent + [None]*pad # pad
|
|
||||||
ptr = pad
|
|
||||||
i = 5
|
|
||||||
n = 2
|
|
||||||
|
|
||||||
ptr -= pad
|
|
||||||
i += pad
|
|
||||||
sent = _realloc(sent, length+n+(pad*2))
|
|
||||||
n_moved = (length + (pad*2)) - i+1
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_split():
|
def test_split():
|
||||||
'''state.split_token should take the ith word of the buffer, and split it
|
'''state.split_token should take the ith word of the buffer, and split it
|
||||||
into n+1 pieces. n is 0-indexed, i.e. split(i, 0) is a noop, and split(i, 1)
|
into n+1 pieces. n is 0-indexed, i.e. split(i, 0) is a noop, and split(i, 1)
|
||||||
creates 1 new token.'''
|
creates 1 new token.'''
|
||||||
doc = get_doc('abcd')
|
doc = get_doc('abcd')
|
||||||
state = StateClass(doc)
|
state = StateClass(doc, max_split=3)
|
||||||
assert len(state) == len(doc)
|
assert state.queue == [0, 1, 2, 3]
|
||||||
state.split_token(1, 2)
|
state.split_token(1, 2, fast_forward=False)
|
||||||
assert len(state) == len(doc)+2
|
assert state.queue == [0, 1, 1*4+1, 2*4+1, 2, 3]
|
||||||
stdoc = state.get_doc(doc.vocab)
|
|
||||||
assert stdoc[0].text == 'a'
|
|
||||||
assert stdoc[1].text == 'b'
|
|
||||||
assert stdoc[2].text == 'b'
|
|
||||||
assert stdoc[3].text == 'b'
|
|
||||||
assert stdoc[4].text == 'c'
|
|
||||||
assert stdoc[5].text == 'd'
|
|
||||||
|
|
|
@ -320,7 +320,7 @@ cdef class Doc:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
return 1.0
|
return 1.0
|
||||||
|
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user