mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-21 05:23:06 +03:00
Convert all individual values explicitly to uint64 for array-based doc representations
This commit is contained in:
parent
4b2097a271
commit
a7215e345a
|
@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
|
||||||
|
|
||||||
# head before start
|
# head before start
|
||||||
arr = doc.to_array(["HEAD"])
|
arr = doc.to_array(["HEAD"])
|
||||||
arr[0] = -1
|
arr[0] = numpy.array(-1).astype(numpy.uint64)
|
||||||
doc_from_array = Doc(en_vocab, words=words)
|
doc_from_array = Doc(en_vocab, words=words)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
doc_from_array.from_array(["HEAD"], arr)
|
doc_from_array.from_array(["HEAD"], arr)
|
||||||
|
|
||||||
# head after end
|
# head after end
|
||||||
arr = doc.to_array(["HEAD"])
|
arr = doc.to_array(["HEAD"])
|
||||||
arr[0] = 5
|
arr[0] = numpy.array(5).astype(numpy.uint64)
|
||||||
doc_from_array = Doc(en_vocab, words=words)
|
doc_from_array = Doc(en_vocab, words=words)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
doc_from_array.from_array(["HEAD"], arr)
|
doc_from_array.from_array(["HEAD"], arr)
|
||||||
|
|
|
@ -361,9 +361,9 @@ cdef class Doc:
|
||||||
if annot is heads or annot is sent_starts or annot is ent_iobs:
|
if annot is heads or annot is sent_starts or annot is ent_iobs:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
if attrs.ndim == 1:
|
if attrs.ndim == 1:
|
||||||
attrs[i] = annot[i]
|
attrs[i] = numpy.array(annot[i]).astype(numpy.uint64)
|
||||||
else:
|
else:
|
||||||
attrs[i, j] = annot[i]
|
attrs[i, j] = numpy.array(annot[i]).astype(numpy.uint64)
|
||||||
elif annot is morphs:
|
elif annot is morphs:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
morph_key = vocab.morphology.add(morphs[i])
|
morph_key = vocab.morphology.add(morphs[i])
|
||||||
|
@ -981,7 +981,7 @@ cdef class Doc:
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
token = &self.c[i]
|
token = &self.c[i]
|
||||||
for j in range(nr_attr):
|
for j in range(nr_attr):
|
||||||
c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
|
c_output[i*nr_attr + j] = numpy.array(get_token_attr(token, c_attr_ids[j])).astype(numpy.uint64)
|
||||||
# Handle 1d case
|
# Handle 1d case
|
||||||
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
|
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
|
||||||
|
|
||||||
|
@ -1559,7 +1559,7 @@ cdef class Doc:
|
||||||
for j, (attr, annot) in enumerate(token_annotations.items()):
|
for j, (attr, annot) in enumerate(token_annotations.items()):
|
||||||
if attr is HEAD:
|
if attr is HEAD:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
array[i, j] = annot[i]
|
array[i, j] = numpy.array(annot[i]).astype(numpy.uint64)
|
||||||
elif attr is MORPH:
|
elif attr is MORPH:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
array[i, j] = self.vocab.morphology.add(annot[i])
|
array[i, j] = self.vocab.morphology.add(annot[i])
|
||||||
|
|
|
@ -299,7 +299,7 @@ cdef class Span:
|
||||||
for ancestor in ancestors:
|
for ancestor in ancestors:
|
||||||
ancestor_i = ancestor.i - self.c.start
|
ancestor_i = ancestor.i - self.c.start
|
||||||
if ancestor_i in range(length):
|
if ancestor_i in range(length):
|
||||||
array[i, head_col] = ancestor_i - i
|
array[i, head_col] = numpy.array(ancestor_i - i).astype(numpy.uint64)
|
||||||
|
|
||||||
# if there is no appropriate ancestor, define a new artificial root
|
# if there is no appropriate ancestor, define a new artificial root
|
||||||
value = array[i, head_col]
|
value = array[i, head_col]
|
||||||
|
@ -307,7 +307,7 @@ cdef class Span:
|
||||||
new_root = old_to_new_root.get(ancestor_i, None)
|
new_root = old_to_new_root.get(ancestor_i, None)
|
||||||
if new_root is not None:
|
if new_root is not None:
|
||||||
# take the same artificial root as a previous token from the same sentence
|
# take the same artificial root as a previous token from the same sentence
|
||||||
array[i, head_col] = new_root - i
|
array[i, head_col] = numpy.array(new_root - i).astype(numpy.uint64)
|
||||||
else:
|
else:
|
||||||
# set this token as the new artificial root
|
# set this token as the new artificial root
|
||||||
array[i, head_col] = 0
|
array[i, head_col] = 0
|
||||||
|
|
|
@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
if key not in IDS:
|
if key not in IDS:
|
||||||
raise ValueError(Errors.E974.format(obj="token", key=key))
|
raise ValueError(Errors.E974.format(obj="token", key=key))
|
||||||
elif key in ["ORTH", "SPACY"]:
|
elif key in ["ORTH", "SPACY"]:
|
||||||
pass
|
continue
|
||||||
elif key == "HEAD":
|
elif key == "HEAD":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
|
row = [h-i if h is not None else 0 for i, h in enumerate(value)]
|
||||||
elif key == "DEP":
|
elif key == "DEP":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
|
row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
|
||||||
elif key == "SENT_START":
|
elif key == "SENT_START":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([to_ternary_int(v) for v in value])
|
row = [to_ternary_int(v) for v in value]
|
||||||
elif key == "MORPH":
|
elif key == "MORPH":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.morphology.add(v) for v in value])
|
row = [vocab.morphology.add(v) for v in value]
|
||||||
else:
|
else:
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
if not all(isinstance(v, str) for v in value):
|
if not all(isinstance(v, str) for v in value):
|
||||||
types = set([type(v) for v in value])
|
types = set([type(v) for v in value])
|
||||||
raise TypeError(Errors.E969.format(field=key, types=types)) from None
|
raise TypeError(Errors.E969.format(field=key, types=types)) from None
|
||||||
values.append([vocab.strings.add(v) for v in value])
|
row = [vocab.strings.add(v) for v in value]
|
||||||
array = numpy.asarray(values, dtype="uint64")
|
values.append([numpy.array(v).astype(numpy.uint64) for v in row])
|
||||||
|
array = numpy.array(values, dtype=numpy.uint64)
|
||||||
return attrs, array.T
|
return attrs, array.T
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user