mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Fix serializer
This commit is contained in:
parent
b16ae75824
commit
4efb391994
|
@ -1,4 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
|
# cython: infer_types=True
|
||||||
|
# cython: bounds_check=False
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
@ -565,7 +567,7 @@ cdef class Doc:
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
self.c[i] = parsed[i]
|
self.c[i] = parsed[i]
|
||||||
|
|
||||||
def from_array(self, attrs, array):
|
def from_array(self, attrs, int[:, :] array):
|
||||||
"""
|
"""
|
||||||
Write to a `Doc` object, from an `(M, N)` array of attributes.
|
Write to a `Doc` object, from an `(M, N)` array of attributes.
|
||||||
"""
|
"""
|
||||||
|
@ -573,34 +575,23 @@ cdef class Doc:
|
||||||
cdef attr_id_t attr_id
|
cdef attr_id_t attr_id
|
||||||
cdef TokenC* tokens = self.c
|
cdef TokenC* tokens = self.c
|
||||||
cdef int length = len(array)
|
cdef int length = len(array)
|
||||||
cdef attr_t[:] values
|
# Get set up for fast loading
|
||||||
|
cdef Pool mem = Pool()
|
||||||
|
cdef int n_attrs = len(attrs)
|
||||||
|
attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
|
||||||
|
for i, attr_id in enumerate(attrs):
|
||||||
|
attr_ids[i] = attr_id
|
||||||
|
# Now load the data
|
||||||
|
for i in range(self.length):
|
||||||
|
token = &self.c[i]
|
||||||
|
for j in range(n_attrs):
|
||||||
|
Token.set_struct_attr(token, attr_ids[j], array[i, j])
|
||||||
|
# Auxiliary loading logic
|
||||||
for col, attr_id in enumerate(attrs):
|
for col, attr_id in enumerate(attrs):
|
||||||
values = array[:, col]
|
if attr_id == TAG:
|
||||||
if attr_id == HEAD:
|
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
tokens[i].head = values[i]
|
if array[i, col] != 0:
|
||||||
if values[i] >= 1:
|
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
|
||||||
tokens[i + values[i]].l_kids += 1
|
|
||||||
elif values[i] < 0:
|
|
||||||
tokens[i + values[i]].r_kids += 1
|
|
||||||
elif attr_id == TAG:
|
|
||||||
for i in range(length):
|
|
||||||
if values[i] != 0:
|
|
||||||
self.vocab.morphology.assign_tag(&tokens[i], values[i])
|
|
||||||
elif attr_id == POS:
|
|
||||||
for i in range(length):
|
|
||||||
tokens[i].pos = <univ_pos_t>values[i]
|
|
||||||
elif attr_id == DEP:
|
|
||||||
for i in range(length):
|
|
||||||
tokens[i].dep = values[i]
|
|
||||||
elif attr_id == ENT_IOB:
|
|
||||||
for i in range(length):
|
|
||||||
tokens[i].ent_iob = values[i]
|
|
||||||
elif attr_id == ENT_TYPE:
|
|
||||||
for i in range(length):
|
|
||||||
tokens[i].ent_type = values[i]
|
|
||||||
else:
|
|
||||||
raise ValueError("Unknown attribute ID: %d" % attr_id)
|
|
||||||
set_children_from_heads(self.c, self.length)
|
set_children_from_heads(self.c, self.length)
|
||||||
self.is_parsed = bool(HEAD in attrs or DEP in attrs)
|
self.is_parsed = bool(HEAD in attrs or DEP in attrs)
|
||||||
self.is_tagged = bool(TAG in attrs or POS in attrs)
|
self.is_tagged = bool(TAG in attrs or POS in attrs)
|
||||||
|
@ -645,9 +636,9 @@ cdef class Doc:
|
||||||
self.push_back(lex, has_space)
|
self.push_back(lex, has_space)
|
||||||
|
|
||||||
start = end + has_space
|
start = end + has_space
|
||||||
self.from_array(attrs[:, 2:],
|
self.from_array([TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE],
|
||||||
[TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE])
|
attrs[:, 2:])
|
||||||
|
return self
|
||||||
|
|
||||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user